diff --git a/.buildkite/sdist.yml b/.buildkite/sdist.yml deleted file mode 100644 index 9b94e3752..000000000 --- a/.buildkite/sdist.yml +++ /dev/null @@ -1,11 +0,0 @@ -steps: - - - command: "fab env clean make test sdist" - label: ":dizzy: :python:" - artifact_paths: "dist/*.tar.gz" - - wait - - trigger: "spacy-sdist-against-models" - label: ":dizzy: :hammer:" - build: - env: - SPACY_VERSION: "{$SPACY_VERSION}" diff --git a/.buildkite/train.yml b/.buildkite/train.yml deleted file mode 100644 index b257db87c..000000000 --- a/.buildkite/train.yml +++ /dev/null @@ -1,11 +0,0 @@ -steps: - - - command: "fab env clean make test wheel" - label: ":dizzy: :python:" - artifact_paths: "dist/*.whl" - - wait - - trigger: "spacy-train-from-wheel" - label: ":dizzy: :train:" - build: - env: - SPACY_VERSION: "{$SPACY_VERSION}" diff --git a/Makefile b/Makefile index cfdcdcd79..64bb0b57a 100644 --- a/Makefile +++ b/Makefile @@ -2,10 +2,26 @@ SHELL := /bin/bash PYVER := 3.6 VENV := ./env$(PYVER) -version := $(shell "bin/get-version.sh") +ifndef SPACY_EXTRAS +override SPACY_EXTRAS = spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core +endif -dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core +version := $(shell "bin/get-version.sh") +package := $(shell "bin/get-package.sh") + +ifndef SPACY_BIN +override SPACY_BIN = $(package)-$(version).pex +endif + +dist/$(SPACY_BIN) : wheelhouse/spacy-$(version).stamp + $(VENV)/bin/pex \ + -f ./wheelhouse \ + --no-index \ + --disable-cache \ + -m spacy \ + -o $@ \ + $(package)==$(version) \ + $(SPACY_EXTRAS) chmod a+rx $@ cp $@ dist/spacy.pex @@ -15,7 +31,8 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse - $(VENV)/bin/pip wheel spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse + $(VENV)/bin/pip wheel $(SPACY_EXTRAS) -w ./wheelhouse + touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/bin/__init__.py b/bin/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bin/get-package.sh b/bin/get-package.sh new file mode 100755 index 000000000..d60b930b4 --- /dev/null +++ b/bin/get-package.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -e + +version=$(grep "__title__ = " spacy/about.py) +version=${version/__title__ = } +version=${version/\'/} +version=${version/\'/} +version=${version/\"/} +version=${version/\"/} + +echo $version diff --git a/bin/load_reddit.py b/bin/load_reddit.py deleted file mode 100644 index afddd3798..000000000 --- a/bin/load_reddit.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import bz2 -import re -import srsly -import sys -import random -import datetime -import plac -from pathlib import Path - -_unset = object() - - -class Reddit(object): - """Stream cleaned comments from Reddit.""" - - pre_format_re = re.compile(r"^[`*~]") - post_format_re = re.compile(r"[`*~]$") - url_re = re.compile(r"\[([^]]+)\]\(%%URL\)") - link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)") - - def __init__(self, file_path, meta_keys={"subreddit": "section"}): - """ - file_path (unicode / Path): Path to archive or directory of archives. - meta_keys (dict): Meta data key included in the Reddit corpus, mapped - to display name in Prodigy meta. 
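For illustration only: a rough Python equivalent of what the new bin/get-package.sh above pulls out of spacy/about.py via bash string substitution (and what the referenced, but not shown, bin/get-version.sh presumably does for __version__). The helper name is invented for this sketch.

import re
from pathlib import Path

def read_about_field(field, about_path="spacy/about.py"):
    # Matches lines like: __title__ = "spacy"  (single or double quotes)
    text = Path(about_path).read_text(encoding="utf8")
    match = re.search(r"^%s\s*=\s*['\"]([^'\"]+)['\"]" % re.escape(field), text, re.M)
    if match is None:
        raise ValueError("%s not found in %s" % (field, about_path))
    return match.group(1)

if __name__ == "__main__":
    print(read_about_field("__title__"))    # package name, as used for $(package) in the Makefile
    print(read_about_field("__version__"))  # version string, as used for $(version)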
- RETURNS (Reddit): The Reddit loader. - """ - self.meta = meta_keys - file_path = Path(file_path) - if not file_path.exists(): - raise IOError("Can't find file path: {}".format(file_path)) - if not file_path.is_dir(): - self.files = [file_path] - else: - self.files = list(file_path.iterdir()) - - def __iter__(self): - for file_path in self.iter_files(): - with bz2.open(str(file_path)) as f: - for line in f: - line = line.strip() - if not line: - continue - comment = srsly.json_loads(line) - if self.is_valid(comment): - text = self.strip_tags(comment["body"]) - yield {"text": text} - - def get_meta(self, item): - return {name: item.get(key, "n/a") for key, name in self.meta.items()} - - def iter_files(self): - for file_path in self.files: - yield file_path - - def strip_tags(self, text): - text = self.link_re.sub(r"\1", text) - text = text.replace(">", ">").replace("<", "<") - text = self.pre_format_re.sub("", text) - text = self.post_format_re.sub("", text) - text = re.sub(r"\s+", " ", text) - return text.strip() - - def is_valid(self, comment): - return ( - comment["body"] is not None - and comment["body"] != "[deleted]" - and comment["body"] != "[removed]" - ) - - -def main(path): - reddit = Reddit(path) - for comment in reddit: - print(srsly.json_dumps(comment)) - - -if __name__ == "__main__": - import socket - - try: - BrokenPipeError - except NameError: - BrokenPipeError = socket.error - try: - plac.call(main) - except BrokenPipeError: - import os, sys - - # Python flushes standard streams on exit; redirect remaining output - # to devnull to avoid another BrokenPipeError at shutdown - devnull = os.open(os.devnull, os.O_WRONLY) - os.dup2(devnull, sys.stdout.fileno()) - sys.exit(1) # Python exits with error code 1 on EPIPE diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py deleted file mode 100644 index 663ce060d..000000000 --- a/bin/train_word_vectors.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function, unicode_literals, division - -import logging -from pathlib import Path -from collections import defaultdict -from gensim.models import Word2Vec -import plac -import spacy - -logger = logging.getLogger(__name__) - - -class Corpus(object): - def __init__(self, directory, nlp): - self.directory = directory - self.nlp = nlp - - def __iter__(self): - for text_loc in iter_dir(self.directory): - with text_loc.open("r", encoding="utf-8") as file_: - text = file_.read() - - # This is to keep the input to the blank model (which doesn't - # sentencize) from being too long. 
It works particularly well with - # the output of [WikiExtractor](https://github.com/attardi/wikiextractor) - paragraphs = text.split('\n\n') - for par in paragraphs: - yield [word.orth_ for word in self.nlp(par)] - - -def iter_dir(loc): - dir_path = Path(loc) - for fn_path in dir_path.iterdir(): - if fn_path.is_dir(): - for sub_path in fn_path.iterdir(): - yield sub_path - else: - yield fn_path - - -@plac.annotations( - lang=("ISO language code"), - in_dir=("Location of input directory"), - out_loc=("Location of output file"), - n_workers=("Number of workers", "option", "n", int), - size=("Dimension of the word vectors", "option", "d", int), - window=("Context window size", "option", "w", int), - min_count=("Min count", "option", "m", int), - negative=("Number of negative samples", "option", "g", int), - nr_iter=("Number of iterations", "option", "i", int), -) -def main( - lang, - in_dir, - out_loc, - negative=5, - n_workers=4, - window=5, - size=128, - min_count=10, - nr_iter=5, -): - logging.basicConfig( - format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO - ) - nlp = spacy.blank(lang) - corpus = Corpus(in_dir, nlp) - model = Word2Vec( - sentences=corpus, - size=size, - window=window, - min_count=min_count, - workers=n_workers, - sample=1e-5, - negative=negative, - ) - model.save(out_loc) - -if __name__ == "__main__": - plac.call(main) diff --git a/bin/ud/__init__.py b/bin/ud/__init__.py deleted file mode 100644 index 119c46ba4..000000000 --- a/bin/ud/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .conll17_ud_eval import main as ud_evaluate # noqa: F401 -from .ud_train import main as ud_train # noqa: F401 diff --git a/bin/ud/conll17_ud_eval.py b/bin/ud/conll17_ud_eval.py deleted file mode 100644 index 88acfabac..000000000 --- a/bin/ud/conll17_ud_eval.py +++ /dev/null @@ -1,614 +0,0 @@ -#!/usr/bin/env python -# flake8: noqa - -# CoNLL 2017 UD Parsing evaluation script. -# -# Compatible with Python 2.7 and 3.2+, can be used either as a module -# or a standalone executable. -# -# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL), -# Faculty of Mathematics and Physics, Charles University, Czech Republic. 
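For context, a minimal sketch of how vectors written by the deleted train_word_vectors.py script above could be loaded and inspected afterwards. It assumes a gensim version older than 4.0 (matching the size= keyword used above); the model path and query word are placeholders.

from gensim.models import Word2Vec

model = Word2Vec.load("vectors_en.model")        # the path passed as out_loc above
vector = model.wv["london"]                      # one word vector, 128-dim with the default -d
print(vector.shape)
print(model.wv.most_similar("london", topn=5))   # most similar words by cosine similarity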
-# -# Changelog: -# - [02 Jan 2017] Version 0.9: Initial release -# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation -# - [10 Mar 2017] Version 1.0: Add documentation and test -# Compare HEADs correctly using aligned words -# Allow evaluation with errorneous spaces in forms -# Compare forms in LCS case insensitively -# Detect cycles and multiple root nodes -# Compute AlignedAccuracy - -# Command line usage -# ------------------ -# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file -# -# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics -# is printed -# - if -v is given, several metrics are printed (as precision, recall, F1 score, -# and in case the metric is computed on aligned words also accuracy on these): -# - Tokens: how well do the gold tokens match system tokens -# - Sentences: how well do the gold sentences match system sentences -# - Words: how well can the gold words be aligned to system words -# - UPOS: using aligned words, how well does UPOS match -# - XPOS: using aligned words, how well does XPOS match -# - Feats: using aligned words, how well does FEATS match -# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match -# - Lemmas: using aligned words, how well does LEMMA match -# - UAS: using aligned words, how well does HEAD match -# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match -# - if weights_file is given (with lines containing deprel-weight pairs), -# one more metric is shown: -# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight - -# API usage -# --------- -# - load_conllu(file) -# - loads CoNLL-U file from given file object to an internal representation -# - the file object should return str on both Python 2 and Python 3 -# - raises UDError exception if the given file cannot be loaded -# - evaluate(gold_ud, system_ud) -# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) -# - raises UDError if the concatenated tokens of gold and system file do not match -# - returns a dictionary with the metrics described above, each metrics having -# four fields: precision, recall, f1 and aligned_accuracy (when using aligned -# words, otherwise this is None) - -# Description of token matching -# ----------------------------- -# In order to match tokens of gold file and system file, we consider the text -# resulting from concatenation of gold tokens and text resulting from -# concatenation of system tokens. These texts should match -- if they do not, -# the evaluation fails. -# -# If the texts do match, every token is represented as a range in this original -# text, and tokens are equal only if their range is the same. - -# Description of word matching -# ---------------------------- -# When matching words of gold file and system file, we first match the tokens. -# The words which are also tokens are matched as tokens, but words in multi-word -# tokens have to be handled differently. -# -# To handle multi-word tokens, we start by finding "multi-word spans". -# Multi-word span is a span in the original text such that -# - it contains at least one multi-word token -# - all multi-word tokens in the span (considering both gold and system ones) -# are completely inside the span (i.e., they do not "stick out") -# - the multi-word span is as small as possible -# -# For every multi-word span, we align the gold and system words completely -# inside this span using LCS on their FORMs. 
The words not intersecting -# (even partially) any multi-word span are then aligned as tokens. - - -from __future__ import division -from __future__ import print_function - -import argparse -import io -import sys -import unittest - -# CoNLL-U column names -ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) - -# UD Error is used when raising exceptions in this module -class UDError(Exception): - pass - -# Load given CoNLL-U file into internal representation -def load_conllu(file, check_parse=True): - # Internal representation classes - class UDRepresentation: - def __init__(self): - # Characters of all the tokens in the whole file. - # Whitespace between tokens is not included. - self.characters = [] - # List of UDSpan instances with start&end indices into `characters`. - self.tokens = [] - # List of UDWord instances. - self.words = [] - # List of UDSpan instances with start&end indices into `characters`. - self.sentences = [] - class UDSpan: - def __init__(self, start, end, characters): - self.start = start - # Note that self.end marks the first position **after the end** of span, - # so we can use characters[start:end] or range(start, end). - self.end = end - self.characters = characters - - @property - def text(self): - return ''.join(self.characters[self.start:self.end]) - - def __str__(self): - return self.text - - def __repr__(self): - return self.text - class UDWord: - def __init__(self, span, columns, is_multiword): - # Span of this word (or MWT, see below) within ud_representation.characters. - self.span = span - # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... - self.columns = columns - # is_multiword==True means that this word is part of a multi-word token. - # In that case, self.span marks the span of the whole multi-word token. - self.is_multiword = is_multiword - # Reference to the UDWord instance representing the HEAD (or None if root). - self.parent = None - # Let's ignore language-specific deprel subtypes. 
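# (e.g. a language-specific subtype such as "nsubj:pass" or "acl:relcl" is scored as plain "nsubj" / "acl")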
- self.columns[DEPREL] = columns[DEPREL].split(':')[0] - - ud = UDRepresentation() - - # Load the CoNLL-U file - index, sentence_start = 0, None - linenum = 0 - while True: - line = file.readline() - linenum += 1 - if not line: - break - line = line.rstrip("\r\n") - - # Handle sentence start boundaries - if sentence_start is None: - # Skip comments - if line.startswith("#"): - continue - # Start a new sentence - ud.sentences.append(UDSpan(index, 0, ud.characters)) - sentence_start = len(ud.words) - if not line: - # Add parent UDWord links and check there are no cycles - def process_word(word): - if word.parent == "remapping": - raise UDError("There is a cycle in a sentence") - if word.parent is None: - head = int(word.columns[HEAD]) - if head > len(ud.words) - sentence_start: - raise UDError("Line {}: HEAD '{}' points outside of the sentence".format( - linenum, word.columns[HEAD])) - if head: - parent = ud.words[sentence_start + head - 1] - word.parent = "remapping" - process_word(parent) - word.parent = parent - - for word in ud.words[sentence_start:]: - process_word(word) - - # Check there is a single root node - if check_parse: - if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1: - raise UDError("There are multiple roots in a sentence") - - # End the sentence - ud.sentences[-1].end = index - sentence_start = None - continue - - # Read next token/word - columns = line.split("\t") - if len(columns) != 10: - raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line)) - - # Skip empty nodes - if "." in columns[ID]: - continue - - # Delete spaces from FORM so gold.characters == system.characters - # even if one of them tokenizes the space. - columns[FORM] = columns[FORM].replace(" ", "") - if not columns[FORM]: - raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum) - - # Save token - ud.characters.extend(columns[FORM]) - ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters)) - index += len(columns[FORM]) - - # Handle multi-word tokens to save word(s) - if "-" in columns[ID]: - try: - start, end = map(int, columns[ID].split("-")) - except: - raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID])) - - for _ in range(start, end + 1): - word_line = file.readline().rstrip("\r\n") - word_columns = word_line.split("\t") - if len(word_columns) != 10: - print(columns) - raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line)) - ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) - # Basic tokens/words - else: - try: - word_id = int(columns[ID]) - except: - raise UDError("Cannot parse word ID '{}'".format(columns[ID])) - if word_id != len(ud.words) - sentence_start + 1: - raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1)) - - try: - head_id = int(columns[HEAD]) - except: - raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD])) - if head_id < 0: - raise UDError("HEAD cannot be negative") - - ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) - - if sentence_start is not None: - raise UDError("The CoNLL-U file does not end with empty line") - - return ud - -# Evaluate the gold and system treebanks (loaded using load_conllu). 
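A minimal sketch of the programmatic API documented in the header above; the file paths are placeholders, and load_conllu_file is defined further down in this module.

import conll17_ud_eval

gold_ud = conll17_ud_eval.load_conllu_file("gold.conllu")
system_ud = conll17_ud_eval.load_conllu_file("system.conllu")
scores = conll17_ud_eval.evaluate(gold_ud, system_ud)
# Each entry holds precision, recall, f1 and, for aligned-word metrics, aligned_accuracy.
print("LAS F1: {:.2f}".format(100 * scores["LAS"].f1))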
-def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True): - class Score: - def __init__(self, gold_total, system_total, correct, aligned_total=None, undersegmented=None, oversegmented=None): - self.precision = correct / system_total if system_total else 0.0 - self.recall = correct / gold_total if gold_total else 0.0 - self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 - self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total - self.undersegmented = undersegmented - self.oversegmented = oversegmented - self.under_perc = len(undersegmented) / gold_total if gold_total and undersegmented else 0.0 - self.over_perc = len(oversegmented) / gold_total if gold_total and oversegmented else 0.0 - class AlignmentWord: - def __init__(self, gold_word, system_word): - self.gold_word = gold_word - self.system_word = system_word - self.gold_parent = None - self.system_parent_gold_aligned = None - class Alignment: - def __init__(self, gold_words, system_words): - self.gold_words = gold_words - self.system_words = system_words - self.matched_words = [] - self.matched_words_map = {} - def append_aligned_words(self, gold_word, system_word): - self.matched_words.append(AlignmentWord(gold_word, system_word)) - self.matched_words_map[system_word] = gold_word - def fill_parents(self): - # We represent root parents in both gold and system data by '0'. - # For gold data, we represent non-root parent by corresponding gold word. - # For system data, we represent non-root parent by either gold word aligned - # to parent system nodes, or by None if no gold words is aligned to the parent. - for words in self.matched_words: - words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0 - words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \ - if words.system_word.parent is not None else 0 - - def lower(text): - if sys.version_info < (3, 0) and isinstance(text, str): - return text.decode("utf-8").lower() - return text.lower() - - def spans_score(gold_spans, system_spans): - correct, gi, si = 0, 0, 0 - undersegmented = [] - oversegmented = [] - combo = 0 - previous_end_si_earlier = False - previous_end_gi_earlier = False - while gi < len(gold_spans) and si < len(system_spans): - previous_si = system_spans[si-1] if si > 0 else None - previous_gi = gold_spans[gi-1] if gi > 0 else None - if system_spans[si].start < gold_spans[gi].start: - # avoid counting the same mistake twice - if not previous_end_si_earlier: - combo += 1 - oversegmented.append(str(previous_gi).strip()) - si += 1 - elif gold_spans[gi].start < system_spans[si].start: - # avoid counting the same mistake twice - if not previous_end_gi_earlier: - combo += 1 - undersegmented.append(str(previous_si).strip()) - gi += 1 - else: - correct += gold_spans[gi].end == system_spans[si].end - if gold_spans[gi].end < system_spans[si].end: - undersegmented.append(str(system_spans[si]).strip()) - previous_end_gi_earlier = True - previous_end_si_earlier = False - elif gold_spans[gi].end > system_spans[si].end: - oversegmented.append(str(gold_spans[gi]).strip()) - previous_end_si_earlier = True - previous_end_gi_earlier = False - else: - previous_end_gi_earlier = False - previous_end_si_earlier = False - si += 1 - gi += 1 - - return Score(len(gold_spans), len(system_spans), correct, None, undersegmented, oversegmented) - - def alignment_score(alignment, key_fn, weight_fn=lambda w: 1): - gold, system, aligned, correct = 0, 0, 0, 0 
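# gold/system/aligned accumulate weighted word counts; correct is incremented further down only when key_fn gives the same value for an aligned pair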
- - for word in alignment.gold_words: - gold += weight_fn(word) - - for word in alignment.system_words: - system += weight_fn(word) - - for words in alignment.matched_words: - aligned += weight_fn(words.gold_word) - - if key_fn is None: - # Return score for whole aligned words - return Score(gold, system, aligned) - - for words in alignment.matched_words: - if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned): - correct += weight_fn(words.gold_word) - - return Score(gold, system, correct, aligned) - - def beyond_end(words, i, multiword_span_end): - if i >= len(words): - return True - if words[i].is_multiword: - return words[i].span.start >= multiword_span_end - return words[i].span.end > multiword_span_end - - def extend_end(word, multiword_span_end): - if word.is_multiword and word.span.end > multiword_span_end: - return word.span.end - return multiword_span_end - - def find_multiword_span(gold_words, system_words, gi, si): - # We know gold_words[gi].is_multiword or system_words[si].is_multiword. - # Find the start of the multiword span (gs, ss), so the multiword span is minimal. - # Initialize multiword_span_end characters index. - if gold_words[gi].is_multiword: - multiword_span_end = gold_words[gi].span.end - if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start: - si += 1 - else: # if system_words[si].is_multiword - multiword_span_end = system_words[si].span.end - if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start: - gi += 1 - gs, ss = gi, si - - # Find the end of the multiword span - # (so both gi and si are pointing to the word following the multiword span end). - while not beyond_end(gold_words, gi, multiword_span_end) or \ - not beyond_end(system_words, si, multiword_span_end): - if gi < len(gold_words) and (si >= len(system_words) or - gold_words[gi].span.start <= system_words[si].span.start): - multiword_span_end = extend_end(gold_words[gi], multiword_span_end) - gi += 1 - else: - multiword_span_end = extend_end(system_words[si], multiword_span_end) - si += 1 - return gs, ss, gi, si - - def compute_lcs(gold_words, system_words, gi, si, gs, ss): - lcs = [[0] * (si - ss) for i in range(gi - gs)] - for g in reversed(range(gi - gs)): - for s in reversed(range(si - ss)): - if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): - lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) - lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) - lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) - return lcs - - def align_words(gold_words, system_words): - alignment = Alignment(gold_words, system_words) - - gi, si = 0, 0 - while gi < len(gold_words) and si < len(system_words): - if gold_words[gi].is_multiword or system_words[si].is_multiword: - # A: Multi-word tokens => align via LCS within the whole "multiword span". - gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si) - - if si > ss and gi > gs: - lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss) - - # Store aligned words - s, g = 0, 0 - while g < gi - gs and s < si - ss: - if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): - alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s]) - g += 1 - s += 1 - elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0): - g += 1 - else: - s += 1 - else: - # B: No multi-word token => align according to spans. 
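# (two words are aligned here only when their character spans match exactly; otherwise the word that starts earlier, gold on ties, is skipped)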
- if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end): - alignment.append_aligned_words(gold_words[gi], system_words[si]) - gi += 1 - si += 1 - elif gold_words[gi].span.start <= system_words[si].span.start: - gi += 1 - else: - si += 1 - - alignment.fill_parents() - - return alignment - - # Check that underlying character sequences do match - if gold_ud.characters != system_ud.characters: - index = 0 - while gold_ud.characters[index] == system_ud.characters[index]: - index += 1 - - raise UDError( - "The concatenation of tokens in gold file and in system file differ!\n" + - "First 20 differing characters in gold file: '{}' and system file: '{}'".format( - "".join(gold_ud.characters[index:index + 20]), - "".join(system_ud.characters[index:index + 20]) - ) - ) - - # Align words - alignment = align_words(gold_ud.words, system_ud.words) - - # Compute the F1-scores - if check_parse: - result = { - "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), - "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), - "Words": alignment_score(alignment, None), - "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]), - "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]), - "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]), - "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])), - "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]), - "UAS": alignment_score(alignment, lambda w, parent: parent), - "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])), - } - else: - result = { - "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), - "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), - "Words": alignment_score(alignment, None), - "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]), - "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]), - } - - - # Add WeightedLAS if weights are given - if deprel_weights is not None: - def weighted_las(word): - return deprel_weights.get(word.columns[DEPREL], 1.0) - result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las) - - return result - -def load_deprel_weights(weights_file): - if weights_file is None: - return None - - deprel_weights = {} - for line in weights_file: - # Ignore comments and empty lines - if line.startswith("#") or not line.strip(): - continue - - columns = line.rstrip("\r\n").split() - if len(columns) != 2: - raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line)) - - deprel_weights[columns[0]] = float(columns[1]) - - return deprel_weights - -def load_conllu_file(path): - _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) - return load_conllu(_file) - -def evaluate_wrapper(args): - # Load CoNLL-U files - gold_ud = load_conllu_file(args.gold_file) - system_ud = load_conllu_file(args.system_file) - - # Load weights if requested - deprel_weights = load_deprel_weights(args.weights) - - return evaluate(gold_ud, system_ud, deprel_weights) - -def main(): - # Parse arguments - parser = argparse.ArgumentParser() - parser.add_argument("gold_file", type=str, - help="Name of the CoNLL-U file with the gold data.") - parser.add_argument("system_file", type=str, - help="Name of the CoNLL-U file with the predicted data.") - 
parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None, - metavar="deprel_weights_file", - help="Compute WeightedLAS using given weights for Universal Dependency Relations.") - parser.add_argument("--verbose", "-v", default=0, action="count", - help="Print all metrics.") - args = parser.parse_args() - - # Use verbose if weights are supplied - if args.weights is not None and not args.verbose: - args.verbose = 1 - - # Evaluate - evaluation = evaluate_wrapper(args) - - # Print the evaluation - if not args.verbose: - print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1)) - else: - metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"] - if args.weights is not None: - metrics.append("WeightedLAS") - - print("Metrics | Precision | Recall | F1 Score | AligndAcc") - print("-----------+-----------+-----------+-----------+-----------") - for metric in metrics: - print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format( - metric, - 100 * evaluation[metric].precision, - 100 * evaluation[metric].recall, - 100 * evaluation[metric].f1, - "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else "" - )) - -if __name__ == "__main__": - main() - -# Tests, which can be executed with `python -m unittest conll17_ud_eval`. -class TestAlignment(unittest.TestCase): - @staticmethod - def _load_words(words): - """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.""" - lines, num_words = [], 0 - for w in words: - parts = w.split(" ") - if len(parts) == 1: - num_words += 1 - lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1))) - else: - lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0])) - for part in parts[1:]: - num_words += 1 - lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1))) - return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"]))) - - def _test_exception(self, gold, system): - self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system)) - - def _test_ok(self, gold, system, correct): - metrics = evaluate(self._load_words(gold), self._load_words(system)) - gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold)) - system_words = sum((max(1, len(word.split(" ")) - 1) for word in system)) - self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1), - (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))) - - def test_exception(self): - self._test_exception(["a"], ["b"]) - - def test_equal(self): - self._test_ok(["a"], ["a"], 1) - self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3) - - def test_equal_with_multiword(self): - self._test_ok(["abc a b c"], ["a", "b", "c"], 3) - self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4) - self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4) - self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5) - - def test_alignment(self): - self._test_ok(["abcd"], ["a", "b", "c", "d"], 0) - self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1) - self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2) - self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2) - self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4) - self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2) - 
self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1) diff --git a/bin/ud/run_eval.py b/bin/ud/run_eval.py deleted file mode 100644 index 3a30c0ee9..000000000 --- a/bin/ud/run_eval.py +++ /dev/null @@ -1,297 +0,0 @@ -import spacy -import time -import re -import plac -import operator -import datetime -from pathlib import Path -import xml.etree.ElementTree as ET - -import conll17_ud_eval -from ud_train import write_conllu -from spacy.lang.lex_attrs import word_shape -from spacy.util import get_lang_class - -# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later) -ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr," - "ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb," - "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl," - "tr, tt, uk, ur, vi, yo, zh") - -# Non-parsing tasks that will be evaluated (works for default models) -EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats'] - -# Tasks that will be evaluated if check_parse=True (does not work for default models) -EVAL_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats', 'UPOS', 'XPOS', 'AllTags', 'UAS', 'LAS'] - -# Minimum frequency an error should have to be printed -PRINT_FREQ = 20 - -# Maximum number of errors printed per category -PRINT_TOTAL = 10 - -space_re = re.compile("\s+") - - -def load_model(modelname, add_sentencizer=False): - """ Load a specific spaCy model """ - loading_start = time.time() - nlp = spacy.load(modelname) - if add_sentencizer: - nlp.add_pipe(nlp.create_pipe('sentencizer')) - loading_end = time.time() - loading_time = loading_end - loading_start - if add_sentencizer: - return nlp, loading_time, modelname + '_sentencizer' - return nlp, loading_time, modelname - - -def load_default_model_sentencizer(lang): - """ Load a generic spaCy model and add the sentencizer for sentence tokenization""" - loading_start = time.time() - lang_class = get_lang_class(lang) - nlp = lang_class() - nlp.add_pipe(nlp.create_pipe('sentencizer')) - loading_end = time.time() - loading_time = loading_end - loading_start - return nlp, loading_time, lang + "_default_" + 'sentencizer' - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -def get_freq_tuples(my_list, print_total_threshold): - """ Turn a list of errors into frequency-sorted tuples thresholded by a certain total number """ - d = {} - for token in my_list: - d.setdefault(token, 0) - d[token] += 1 - return sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:print_total_threshold] - - -def _contains_blinded_text(stats_xml): - """ Heuristic to determine whether the treebank has blinded texts or not """ - tree = ET.parse(stats_xml) - root = tree.getroot() - total_tokens = int(root.find('size/total/tokens').text) - unique_forms = int(root.find('forms').get('unique')) - - # assume the corpus is largely blinded when there are less than 1% unique tokens - return (unique_forms / total_tokens) < 0.01 - - -def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language): - """" Fetch the txt files for all treebanks for a given set of languages """ - all_treebanks = dict() - treebank_size = dict() - for l in languages: - all_treebanks[l] = [] - treebank_size[l] = 0 - - for treebank_dir in ud_dir.iterdir(): - if treebank_dir.is_dir(): - for txt_path in treebank_dir.iterdir(): - if txt_path.name.endswith('-ud-' + corpus + '.txt'): - file_lang = txt_path.name.split('_')[0] - if file_lang in 
languages: - gold_path = treebank_dir / txt_path.name.replace('.txt', '.conllu') - stats_xml = treebank_dir / "stats.xml" - # ignore treebanks where the texts are not publicly available - if not _contains_blinded_text(stats_xml): - if not best_per_language: - all_treebanks[file_lang].append(txt_path) - # check the tokens in the gold annotation to keep only the biggest treebank per language - else: - with gold_path.open(mode='r', encoding='utf-8') as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - gold_tokens = len(gold_ud.tokens) - if treebank_size[file_lang] < gold_tokens: - all_treebanks[file_lang] = [txt_path] - treebank_size[file_lang] = gold_tokens - - return all_treebanks - - -def run_single_eval(nlp, loading_time, print_name, text_path, gold_ud, tmp_output_path, out_file, print_header, - check_parse, print_freq_tasks): - """" Run an evaluation of a model nlp on a certain specified treebank """ - with text_path.open(mode='r', encoding='utf-8') as f: - flat_text = f.read() - - # STEP 1: tokenize text - tokenization_start = time.time() - texts = split_text(flat_text) - docs = list(nlp.pipe(texts)) - tokenization_end = time.time() - tokenization_time = tokenization_end - tokenization_start - - # STEP 2: record stats and timings - tokens_per_s = int(len(gold_ud.tokens) / tokenization_time) - - print_header_1 = ['date', 'text_path', 'gold_tokens', 'model', 'loading_time', 'tokenization_time', 'tokens_per_s'] - print_string_1 = [str(datetime.date.today()), text_path.name, len(gold_ud.tokens), - print_name, "%.2f" % loading_time, "%.2f" % tokenization_time, tokens_per_s] - - # STEP 3: evaluate predicted tokens and features - with tmp_output_path.open(mode="w", encoding="utf8") as tmp_out_file: - write_conllu(docs, tmp_out_file) - with tmp_output_path.open(mode="r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=check_parse) - tmp_output_path.unlink() - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=check_parse) - - # STEP 4: format the scoring results - eval_headers = EVAL_PARSE - if not check_parse: - eval_headers = EVAL_NO_PARSE - - for score_name in eval_headers: - score = scores[score_name] - print_string_1.extend(["%.2f" % score.precision, - "%.2f" % score.recall, - "%.2f" % score.f1]) - print_string_1.append("-" if score.aligned_accuracy is None else "%.2f" % score.aligned_accuracy) - print_string_1.append("-" if score.undersegmented is None else "%.4f" % score.under_perc) - print_string_1.append("-" if score.oversegmented is None else "%.4f" % score.over_perc) - - print_header_1.extend([score_name + '_p', score_name + '_r', score_name + '_F', score_name + '_acc', - score_name + '_under', score_name + '_over']) - - if score_name in print_freq_tasks: - print_header_1.extend([score_name + '_word_under_ex', score_name + '_shape_under_ex', - score_name + '_word_over_ex', score_name + '_shape_over_ex']) - - d_under_words = get_freq_tuples(score.undersegmented, PRINT_TOTAL) - d_under_shapes = get_freq_tuples([word_shape(x) for x in score.undersegmented], PRINT_TOTAL) - d_over_words = get_freq_tuples(score.oversegmented, PRINT_TOTAL) - d_over_shapes = get_freq_tuples([word_shape(x) for x in score.oversegmented], PRINT_TOTAL) - - # saving to CSV with ; seperator so blinding ; in the example output - print_string_1.append( - str({k: v for k, v in d_under_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) - print_string_1.append( - str({k: v for k, v in d_under_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) 
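# (the results file itself is ;-separated, so literal semicolons inside these example dicts are masked as *SEMICOLON*)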
- print_string_1.append( - str({k: v for k, v in d_over_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) - print_string_1.append( - str({k: v for k, v in d_over_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) - - # STEP 5: print the formatted results to CSV - if print_header: - out_file.write(';'.join(map(str, print_header_1)) + '\n') - out_file.write(';'.join(map(str, print_string_1)) + '\n') - - -def run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks): - """" Run an evaluation for each language with its specified models and treebanks """ - print_header = True - - for tb_lang, treebank_list in treebanks.items(): - print() - print("Language", tb_lang) - for text_path in treebank_list: - print(" Evaluating on", text_path) - - gold_path = text_path.parent / (text_path.stem + '.conllu') - print(" Gold data from ", gold_path) - - # nested try blocks to ensure the code can continue with the next iteration after a failure - try: - with gold_path.open(mode='r', encoding='utf-8') as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - - for nlp, nlp_loading_time, nlp_name in models[tb_lang]: - try: - print(" Benchmarking", nlp_name) - tmp_output_path = text_path.parent / str('tmp_' + nlp_name + '.conllu') - run_single_eval(nlp, nlp_loading_time, nlp_name, text_path, gold_ud, tmp_output_path, out_file, - print_header, check_parse, print_freq_tasks) - print_header = False - except Exception as e: - print(" Ran into trouble: ", str(e)) - except Exception as e: - print(" Ran into trouble: ", str(e)) - - -@plac.annotations( - out_path=("Path to output CSV file", "positional", None, Path), - ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - check_parse=("Set flag to evaluate parsing performance", "flag", "p", bool), - langs=("Enumeration of languages to evaluate (default: all)", "option", "l", str), - exclude_trained_models=("Set flag to exclude trained models", "flag", "t", bool), - exclude_multi=("Set flag to exclude the multi-language model as default baseline", "flag", "m", bool), - hide_freq=("Set flag to avoid printing out more detailed high-freq tokenization errors", "flag", "f", bool), - corpus=("Whether to run on train, dev or test", "option", "c", str), - best_per_language=("Set flag to only keep the largest treebank for each language", "flag", "b", bool) -) -def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_trained_models=False, exclude_multi=False, - hide_freq=False, corpus='train', best_per_language=False): - """" - Assemble all treebanks and models to run evaluations with. 
- When setting check_parse to True, the default models will not be evaluated as they don't have parsing functionality - """ - languages = [lang.strip() for lang in langs.split(",")] - - print_freq_tasks = [] - if not hide_freq: - print_freq_tasks = ['Tokens'] - - # fetching all relevant treebank from the directory - treebanks = fetch_all_treebanks(ud_dir, languages, corpus, best_per_language) - - print() - print("Loading all relevant models for", languages) - models = dict() - - # multi-lang model - multi = None - if not exclude_multi and not check_parse: - multi = load_model('xx_ent_wiki_sm', add_sentencizer=True) - - # initialize all models with the multi-lang model - for lang in languages: - UD_lang = lang - # Norwegian is 'nb' in spaCy but 'no' in the UD corpora - if lang == "nb": - UD_lang = "no" - try: - models[UD_lang] = [multi] if multi else [] - # add default models if we don't want to evaluate parsing info - if not check_parse: - models[UD_lang].append(load_default_model_sentencizer(lang)) - except: - print(f"Exception initializing lang {lang} - skipping") - - # language-specific trained models - if not exclude_trained_models: - news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"] - news_languages = ["nb"] - web_languages = ["en", "zh"] - sizes = ["sm", "md", "lg"] - for lang in web_languages: - UD_lang = lang - for size in sizes: - model_name = f'{lang}_core_web_{size}' - try: - models[UD_lang].append(load_model(model_name)) - except Exception as e: - print(f"Error loading {model_name}: {e}") - - for lang in news_languages: - UD_lang = lang - if lang == "nb": - UD_lang = "no" - for size in sizes: - model_name = f'{lang}_core_news_{size}' - try: - models[UD_lang].append(load_model(model_name)) - except Exception as e: - print(f"Error loading {model_name}: {e}") - - with out_path.open(mode='w', encoding='utf-8') as out_file: - run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks) - - -if __name__ == "__main__": - plac.call(main) diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py deleted file mode 100644 index 70c6be0d0..000000000 --- a/bin/ud/ud_run_test.py +++ /dev/null @@ -1,324 +0,0 @@ -# flake8: noqa -"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes -.conllu format for development data, allowing the official scorer to be used. -""" -from __future__ import unicode_literals - -import plac -from pathlib import Path -import re -import sys -import srsly - -import spacy -import spacy.util -from spacy.tokens import Token, Doc -from spacy.matcher import Matcher - - -Fused_begin = None -Fused_inside = None - -from . 
import conll17_ud_eval - -from spacy import lang -from spacy.lang import zh -from spacy.lang import ja -from spacy.lang import ru - - -################ -# Data reading # -################ - -space_re = re.compile(r"\s+") - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -############## -# Evaluation # -############## - - -def read_conllu(file_): - docs = [] - sent = [] - doc = [] - for line in file_: - if line.startswith("# newdoc"): - if doc: - docs.append(doc) - doc = [] - elif line.startswith("#"): - continue - elif not line.strip(): - if sent: - doc.append(sent) - sent = [] - else: - sent.append(list(line.strip().split("\t"))) - if len(sent[-1]) != 10: - print(repr(line)) - raise ValueError - if sent: - doc.append(sent) - if doc: - docs.append(doc) - return docs - - -def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - if text_loc.parts[-1].endswith(".conllu"): - docs = [] - with text_loc.open(encoding="utf8") as file_: - for conllu_doc in read_conllu(file_): - for conllu_sent in conllu_doc: - words = [line[1] for line in conllu_sent] - docs.append(Doc(nlp.vocab, words=words)) - for name, component in nlp.pipeline: - docs = list(component.pipe(docs)) - else: - with text_loc.open("r", encoding="utf8") as text_file: - texts = split_text(text_file.read()) - docs = list(nlp.pipe(texts)) - with sys_loc.open("w", encoding="utf8") as out_file: - write_conllu(docs, out_file) - with gold_loc.open("r", encoding="utf8") as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open("r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file) - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) - return docs, scores - - -def write_conllu(docs, file_): - merger = Matcher(docs[0].vocab) - merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) - for i, doc in enumerate(docs): - matches = [] - if doc.is_parsed: - matches = merger(doc) - spans = [doc[start : end + 1] for _, start, end in matches] - with doc.retokenize() as retokenizer: - for span in spans: - retokenizer.merge(span) - file_.write("# newdoc id = {i}\n".format(i=i)) - for j, sent in enumerate(doc.sents): - file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) - file_.write("# text = {text}\n".format(text=sent.text)) - for k, token in enumerate(sent): - file_.write(_get_token_conllu(token, k, len(sent)) + "\n") - file_.write("\n") - for word in sent: - if word.head.i == word.i and word.dep_ == "ROOT": - break - else: - print("Rootless sentence!") - print(sent) - print(i) - for w in sent: - print(w.i, w.text, w.head.text, w.head.i, w.dep_) - raise ValueError - - -def _get_token_conllu(token, k, sent_len): - if token.check_morph(Fused_begin) and (k + 1 < sent_len): - n = 1 - text = [token.text] - while token.nbor(n).check_morph(Fused_inside): - text.append(token.nbor(n).text) - n += 1 - id_ = "%d-%d" % (k + 1, (k + n)) - fields = [id_, "".join(text)] + ["_"] * 8 - lines = ["\t".join(fields)] - else: - lines = [] - if token.head.i == token.i: - head = 0 - else: - head = k + (token.head.i - token.i) + 1 - fields = [ - str(k + 1), - token.text, - token.lemma_, - token.pos_, - token.tag_, - "_", - str(head), - token.dep_.lower(), - "_", - "_", - ] - if token.check_morph(Fused_begin) and (k + 1 < sent_len): - if k == 0: - fields[1] = token.norm_[0].upper() + token.norm_[1:] - else: - fields[1] = token.norm_ - elif token.check_morph(Fused_inside): - fields[1] = token.norm_ - elif token._.split_start is not None: - split_start = 
token._.split_start - split_end = token._.split_end - split_len = (split_end.i - split_start.i) + 1 - n_in_split = token.i - split_start.i - subtokens = guess_fused_orths(split_start.text, [""] * split_len) - fields[1] = subtokens[n_in_split] - - lines.append("\t".join(fields)) - return "\n".join(lines) - - -def guess_fused_orths(word, ud_forms): - """The UD data 'fused tokens' don't necessarily expand to keys that match - the form. We need orths that exact match the string. Here we make a best - effort to divide up the word.""" - if word == "".join(ud_forms): - # Happy case: we get a perfect split, with each letter accounted for. - return ud_forms - elif len(word) == sum(len(subtoken) for subtoken in ud_forms): - # Unideal, but at least lengths match. - output = [] - remain = word - for subtoken in ud_forms: - assert len(subtoken) >= 1 - output.append(remain[: len(subtoken)]) - remain = remain[len(subtoken) :] - assert len(remain) == 0, (word, ud_forms, remain) - return output - else: - # Let's say word is 6 long, and there are three subtokens. The orths - # *must* equal the original string. Arbitrarily, split [4, 1, 1] - first = word[: len(word) - (len(ud_forms) - 1)] - output = [first] - remain = word[len(first) :] - for i in range(1, len(ud_forms)): - assert remain - output.append(remain[:1]) - remain = remain[1:] - assert len(remain) == 0, (word, output, remain) - return output - - -def print_results(name, ud_scores): - fields = {} - if ud_scores is not None: - fields.update( - { - "words": ud_scores["Words"].f1 * 100, - "sents": ud_scores["Sentences"].f1 * 100, - "tags": ud_scores["XPOS"].f1 * 100, - "uas": ud_scores["UAS"].f1 * 100, - "las": ud_scores["LAS"].f1 * 100, - } - ) - else: - fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0}) - tpl = "\t".join( - (name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}") - ) - print(tpl.format(**fields)) - return fields - - -def get_token_split_start(token): - if token.text == "": - assert token.i != 0 - i = -1 - while token.nbor(i).text == "": - i -= 1 - return token.nbor(i) - elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "": - return token - else: - return None - - -def get_token_split_end(token): - if (token.i + 1) == len(token.doc): - return token if token.text == "" else None - elif token.text != "" and token.nbor(1).text != "": - return None - i = 1 - while (token.i + i) < len(token.doc) and token.nbor(i).text == "": - i += 1 - return token.nbor(i - 1) - - -################## -# Initialization # -################## - - -def load_nlp(experiments_dir, corpus): - nlp = spacy.load(experiments_dir / corpus / "best-model") - return nlp - - -def initialize_pipeline(nlp, examples, config, device): - nlp.add_pipe(nlp.create_pipe("parser")) - return nlp - - -@plac.annotations( - test_data_dir=( - "Path to Universal Dependencies test data", - "positional", - None, - Path, - ), - experiment_dir=("Parent directory with output model", "positional", None, Path), - corpus=( - "UD corpus to evaluate, e.g. 
UD_English, UD_Spanish, etc", - "positional", - None, - str, - ), -) -def main(test_data_dir, experiment_dir, corpus): - Token.set_extension("split_start", getter=get_token_split_start) - Token.set_extension("split_end", getter=get_token_split_end) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - lang.zh.Chinese.Defaults.use_jieba = False - lang.ja.Japanese.Defaults.use_janome = False - lang.ru.Russian.Defaults.use_pymorphy2 = False - - nlp = load_nlp(experiment_dir, corpus) - - treebank_code = nlp.meta["treebank"] - for section in ("test", "dev"): - if section == "dev": - section_dir = "conll17-ud-development-2017-03-19" - else: - section_dir = "conll17-ud-test-2017-05-09" - text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt") - udpipe_path = ( - test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu") - ) - gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu") - - header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"] - print("\t".join(header)) - inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path} - for input_type in ("udp", "raw"): - input_path = inputs[input_type] - output_path = ( - experiment_dir / corpus / "{section}.conllu".format(section=section) - ) - - parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path) - - accuracy = print_results(input_type, test_scores) - acc_path = ( - experiment_dir - / corpus - / "{section}-accuracy.json".format(section=section) - ) - srsly.write_json(acc_path, accuracy) - - -if __name__ == "__main__": - plac.call(main) diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py deleted file mode 100644 index 362057b37..000000000 --- a/bin/ud/ud_train.py +++ /dev/null @@ -1,559 +0,0 @@ -# flake8: noqa -"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes -.conllu format for development data, allowing the official scorer to be used. -""" -from __future__ import unicode_literals - -import plac -from pathlib import Path -import re -import json -import tqdm - -import spacy -import spacy.util -from bin.ud import conll17_ud_eval -from spacy.tokens import Token, Doc -from spacy.gold import Example -from spacy.util import compounding, minibatch -from spacy.gold.batchers import minibatch_by_words -from spacy.pipeline._parser_internals.nonproj import projectivize -from spacy.matcher import Matcher -from spacy import displacy -from collections import defaultdict - -import random - -from spacy import lang -from spacy.lang import zh -from spacy.lang import ja - -try: - import torch -except ImportError: - torch = None - - -################ -# Data reading # -################ - -space_re = re.compile("\s+") - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -def read_data( - nlp, - conllu_file, - text_file, - raw_text=True, - oracle_segments=False, - max_doc_length=None, - limit=None, -): - """Read the CONLLU format into Example objects. If raw_text=True, - include Doc objects created using nlp.make_doc and then aligned against - the gold-standard sequences. If oracle_segments=True, include Doc objects - created from the gold-standard segments. 
At least one must be True.""" - if not raw_text and not oracle_segments: - raise ValueError("At least one of raw_text or oracle_segments must be True") - paragraphs = split_text(text_file.read()) - conllu = read_conllu(conllu_file) - # sd is spacy doc; cd is conllu doc - # cs is conllu sent, ct is conllu token - docs = [] - golds = [] - for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)): - sent_annots = [] - for cs in cd: - sent = defaultdict(list) - for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: - if "." in id_: - continue - if "-" in id_: - continue - id_ = int(id_) - 1 - head = int(head) - 1 if head != "0" else id_ - sent["words"].append(word) - sent["tags"].append(tag) - sent["morphs"].append(_compile_morph_string(morph, pos)) - sent["heads"].append(head) - sent["deps"].append("ROOT" if dep == "root" else dep) - sent["spaces"].append(space_after == "_") - sent["entities"] = ["-"] * len(sent["words"]) # TODO: doc-level format - sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) - if oracle_segments: - docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) - golds.append(sent) - assert golds[-1]["morphs"] is not None - - sent_annots.append(sent) - if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: - doc, gold = _make_gold(nlp, None, sent_annots) - assert gold["morphs"] is not None - sent_annots = [] - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - - if raw_text and sent_annots: - doc, gold = _make_gold(nlp, None, sent_annots) - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - return golds_to_gold_data(docs, golds) - - -def _compile_morph_string(morph_string, pos): - if morph_string == '_': - return f"POS={pos}" - return morph_string + f"|POS={pos}" - - -def read_conllu(file_): - docs = [] - sent = [] - doc = [] - for line in file_: - if line.startswith("# newdoc"): - if doc: - docs.append(doc) - doc = [] - elif line.startswith("#"): - continue - elif not line.strip(): - if sent: - doc.append(sent) - sent = [] - else: - sent.append(list(line.strip().split("\t"))) - if len(sent[-1]) != 10: - print(repr(line)) - raise ValueError - if sent: - doc.append(sent) - if doc: - docs.append(doc) - return docs - - -def _make_gold(nlp, text, sent_annots, drop_deps=0.0): - # Flatten the conll annotations, and adjust the head indices - gold = defaultdict(list) - sent_starts = [] - for sent in sent_annots: - gold["heads"].extend(len(gold["words"])+head for head in sent["heads"]) - for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]: - gold[field].extend(sent[field]) - sent_starts.append(True) - sent_starts.extend([False] * (len(sent["words"]) - 1)) - # Construct text if necessary - assert len(gold["words"]) == len(gold["spaces"]) - if text is None: - text = "".join( - word + " " * space for word, space in zip(gold["words"], gold["spaces"]) - ) - doc = nlp.make_doc(text) - gold.pop("spaces") - gold["sent_starts"] = sent_starts - for i in range(len(gold["heads"])): - if random.random() < drop_deps: - gold["heads"][i] = None - gold["labels"][i] = None - - return doc, gold - - -############################# -# Data transforms for spaCy # -############################# - - -def golds_to_gold_data(docs, golds): - """Get out the training data format used by begin_training""" - data = [] - for doc, gold in zip(docs, golds): - example = Example.from_dict(doc, 
dict(gold)) - data.append(example) - return data - - -############## -# Evaluation # -############## - - -def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - if text_loc.parts[-1].endswith(".conllu"): - docs = [] - with text_loc.open(encoding="utf8") as file_: - for conllu_doc in read_conllu(file_): - for conllu_sent in conllu_doc: - words = [line[1] for line in conllu_sent] - docs.append(Doc(nlp.vocab, words=words)) - for name, component in nlp.pipeline: - docs = list(component.pipe(docs)) - else: - with text_loc.open("r", encoding="utf8") as text_file: - texts = split_text(text_file.read()) - docs = list(nlp.pipe(texts)) - with sys_loc.open("w", encoding="utf8") as out_file: - write_conllu(docs, out_file) - with gold_loc.open("r", encoding="utf8") as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open("r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file) - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) - return docs, scores - - -def write_conllu(docs, file_): - if not Token.has_extension("get_conllu_lines"): - Token.set_extension("get_conllu_lines", method=get_token_conllu) - if not Token.has_extension("begins_fused"): - Token.set_extension("begins_fused", default=False) - if not Token.has_extension("inside_fused"): - Token.set_extension("inside_fused", default=False) - - merger = Matcher(docs[0].vocab) - merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) - for i, doc in enumerate(docs): - matches = [] - if doc.is_parsed: - matches = merger(doc) - spans = [doc[start : end + 1] for _, start, end in matches] - seen_tokens = set() - with doc.retokenize() as retokenizer: - for span in spans: - span_tokens = set(range(span.start, span.end)) - if not span_tokens.intersection(seen_tokens): - retokenizer.merge(span) - seen_tokens.update(span_tokens) - - file_.write("# newdoc id = {i}\n".format(i=i)) - for j, sent in enumerate(doc.sents): - file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) - file_.write("# text = {text}\n".format(text=sent.text)) - for k, token in enumerate(sent): - if token.head.i > sent[-1].i or token.head.i < sent[0].i: - for word in doc[sent[0].i - 10 : sent[0].i]: - print(word.i, word.head.i, word.text, word.dep_) - for word in sent: - print(word.i, word.head.i, word.text, word.dep_) - for word in doc[sent[-1].i : sent[-1].i + 10]: - print(word.i, word.head.i, word.text, word.dep_) - raise ValueError( - "Invalid parse: head outside sentence (%s)" % token.text - ) - file_.write(token._.get_conllu_lines(k) + "\n") - file_.write("\n") - - -def print_progress(itn, losses, ud_scores): - fields = { - "dep_loss": losses.get("parser", 0.0), - "morph_loss": losses.get("morphologizer", 0.0), - "tag_loss": losses.get("tagger", 0.0), - "words": ud_scores["Words"].f1 * 100, - "sents": ud_scores["Sentences"].f1 * 100, - "tags": ud_scores["XPOS"].f1 * 100, - "uas": ud_scores["UAS"].f1 * 100, - "las": ud_scores["LAS"].f1 * 100, - "morph": ud_scores["Feats"].f1 * 100, - } - header = ["Epoch", "P.Loss", "M.Loss", "LAS", "UAS", "TAG", "MORPH", "SENT", "WORD"] - if itn == 0: - print("\t".join(header)) - tpl = "\t".join(( - "{:d}", - "{dep_loss:.1f}", - "{morph_loss:.1f}", - "{las:.1f}", - "{uas:.1f}", - "{tags:.1f}", - "{morph:.1f}", - "{sents:.1f}", - "{words:.1f}", - )) - print(tpl.format(itn, **fields)) - - -# def get_sent_conllu(sent, sent_id): -# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] - - -def get_token_conllu(token, i): - if token._.begins_fused: - n = 1 - while 
token.nbor(n)._.inside_fused: - n += 1 - id_ = "%d-%d" % (i, i + n) - lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"] - else: - lines = [] - if token.head.i == token.i: - head = 0 - else: - head = i + (token.head.i - token.i) + 1 - features = list(token.morph) - feat_str = [] - replacements = {"one": "1", "two": "2", "three": "3"} - for feat in features: - if "=" in feat: - feat_str.append(feat) - elif not feat.startswith("begin") and not feat.startswith("end"): - key, value = feat.split("_", 1) - value = replacements.get(value, value) - feat_str.append("%s=%s" % (key, value.title())) - if not feat_str: - feat_str = "_" - else: - feat_str = "|".join(feat_str) - fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, feat_str, - str(head), token.dep_.lower(), "_", "_"] - lines.append("\t".join(fields)) - return "\n".join(lines) - - -################## -# Initialization # -################## - - -def load_nlp(corpus, config, vectors=None): - lang = corpus.split("_")[0] - nlp = spacy.blank(lang) - if config.vectors: - if not vectors: - raise ValueError( - "config asks for vectors, but no vectors " - "directory set on command line (use -v)" - ) - if (Path(vectors) / corpus).exists(): - nlp.vocab.from_disk(Path(vectors) / corpus / "vocab") - nlp.meta["treebank"] = corpus - return nlp - - -def initialize_pipeline(nlp, examples, config, device): - nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False})) - nlp.add_pipe(nlp.create_pipe("morphologizer")) - nlp.add_pipe(nlp.create_pipe("parser")) - if config.multitask_tag: - nlp.parser.add_multitask_objective("tag") - if config.multitask_sent: - nlp.parser.add_multitask_objective("sent_start") - for eg in examples: - for tag in eg.get_aligned("TAG", as_string=True): - if tag is not None: - nlp.tagger.add_label(tag) - if torch is not None and device != -1: - torch.set_default_tensor_type("torch.cuda.FloatTensor") - optimizer = nlp.begin_training( - lambda: examples, - device=device, - subword_features=config.subword_features, - conv_depth=config.conv_depth, - bilstm_depth=config.bilstm_depth, - ) - if config.pretrained_tok2vec: - _load_pretrained_tok2vec(nlp, config.pretrained_tok2vec) - return optimizer - - -def _load_pretrained_tok2vec(nlp, loc): - """Load pretrained weights for the 'token-to-vector' part of the component - models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
- """ - with Path(loc).open("rb", encoding="utf8") as file_: - weights_data = file_.read() - loaded = [] - for name, component in nlp.pipeline: - if hasattr(component, "model") and component.model.has_ref("tok2vec"): - component.get_ref("tok2vec").from_bytes(weights_data) - loaded.append(name) - return loaded - - -######################## -# Command line helpers # -######################## - - -class Config(object): - def __init__( - self, - vectors=None, - max_doc_length=10, - multitask_tag=False, - multitask_sent=False, - multitask_dep=False, - multitask_vectors=None, - bilstm_depth=0, - nr_epoch=30, - min_batch_size=100, - max_batch_size=1000, - batch_by_words=True, - dropout=0.2, - conv_depth=4, - subword_features=True, - vectors_dir=None, - pretrained_tok2vec=None, - ): - if vectors_dir is not None: - if vectors is None: - vectors = True - if multitask_vectors is None: - multitask_vectors = True - for key, value in locals().items(): - setattr(self, key, value) - - @classmethod - def load(cls, loc, vectors_dir=None): - with Path(loc).open("r", encoding="utf8") as file_: - cfg = json.load(file_) - if vectors_dir is not None: - cfg["vectors_dir"] = vectors_dir - return cls(**cfg) - - -class Dataset(object): - def __init__(self, path, section): - self.path = path - self.section = section - self.conllu = None - self.text = None - for file_path in self.path.iterdir(): - name = file_path.parts[-1] - if section in name and name.endswith("conllu"): - self.conllu = file_path - elif section in name and name.endswith("txt"): - self.text = file_path - if self.conllu is None: - msg = "Could not find .txt file in {path} for {section}" - raise IOError(msg.format(section=section, path=path)) - if self.text is None: - msg = "Could not find .txt file in {path} for {section}" - self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0] - - -class TreebankPaths(object): - def __init__(self, ud_path, treebank, **cfg): - self.train = Dataset(ud_path / treebank, "train") - self.dev = Dataset(ud_path / treebank, "dev") - self.lang = self.train.lang - - -@plac.annotations( - ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - parses_dir=("Directory to write the development parses", "positional", None, Path), - corpus=( - "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", - "positional", - None, - str, - ), - config=("Path to json formatted config file", "option", "C", Path), - limit=("Size limit", "option", "n", int), - gpu_device=("Use GPU", "option", "g", int), - use_oracle_segments=("Use oracle segments", "flag", "G", int), - vectors_dir=( - "Path to directory with pretrained vectors, named e.g. 
en/", - "option", - "v", - Path, - ), -) -def main( - ud_dir, - parses_dir, - corpus, - config=None, - limit=0, - gpu_device=-1, - vectors_dir=None, - use_oracle_segments=False, -): - Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - - spacy.util.fix_random_seed() - lang.zh.Chinese.Defaults.use_jieba = False - lang.ja.Japanese.Defaults.use_janome = False - - if config is not None: - config = Config.load(config, vectors_dir=vectors_dir) - else: - config = Config(vectors_dir=vectors_dir) - paths = TreebankPaths(ud_dir, corpus) - if not (parses_dir / corpus).exists(): - (parses_dir / corpus).mkdir() - print("Train and evaluate", corpus, "using lang", paths.lang) - nlp = load_nlp(paths.lang, config, vectors=vectors_dir) - - examples = read_data( - nlp, - paths.train.conllu.open(encoding="utf8"), - paths.train.text.open(encoding="utf8"), - max_doc_length=config.max_doc_length, - limit=limit, - ) - - optimizer = initialize_pipeline(nlp, examples, config, gpu_device) - - batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) - beam_prob = compounding(0.2, 0.8, 1.001) - for i in range(config.nr_epoch): - examples = read_data( - nlp, - paths.train.conllu.open(encoding="utf8"), - paths.train.text.open(encoding="utf8"), - max_doc_length=config.max_doc_length, - limit=limit, - oracle_segments=use_oracle_segments, - raw_text=not use_oracle_segments, - ) - random.shuffle(examples) - if config.batch_by_words: - batches = minibatch_by_words(examples, size=batch_sizes) - else: - batches = minibatch(examples, size=batch_sizes) - losses = {} - n_train_words = sum(len(eg.predicted) for eg in examples) - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - for batch in batches: - pbar.update(sum(len(ex.predicted) for ex in batch)) - nlp.parser.cfg["beam_update_prob"] = next(beam_prob) - nlp.update( - batch, - sgd=optimizer, - drop=config.dropout, - losses=losses, - ) - - out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) - with nlp.use_params(optimizer.averages): - if use_oracle_segments: - parsed_docs, scores = evaluate(nlp, paths.dev.conllu, - paths.dev.conllu, out_path) - else: - parsed_docs, scores = evaluate(nlp, paths.dev.text, - paths.dev.conllu, out_path) - print_progress(i, losses, scores) - - -def _render_parses(i, to_render): - to_render[0].user_data["title"] = "Batch %d" % i - with Path("/tmp/parses.html").open("w", encoding="utf8") as file_: - html = displacy.render(to_render[:5], style="dep", page=True) - file_.write(html) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 869077531..000000000 --- a/examples/README.md +++ /dev/null @@ -1,19 +0,0 @@ - - -# spaCy examples - -The examples are Python scripts with well-behaved command line interfaces. For -more detailed usage guides, see the [documentation](https://spacy.io/usage/). - -To see the available arguments, you can use the `--help` or `-h` flag: - -```bash -$ python examples/training/train_ner.py --help -``` - -While we try to keep the examples up to date, they are not currently exercised -by the test suite, as some of them require significant data downloads or take -time to train. If you find that an example is no longer running, -[please tell us](https://github.com/explosion/spaCy/issues)! 
We know there's -nothing worse than trying to figure out what you're doing wrong, and it turns -out your code was never the problem. diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py deleted file mode 100644 index bf857b8b7..000000000 --- a/examples/deep_learning_keras.py +++ /dev/null @@ -1,266 +0,0 @@ -""" -This example shows how to use an LSTM sentiment classification model trained -using Keras in spaCy. spaCy splits the document into sentences, and each -sentence is classified using the LSTM. The scores for the sentences are then -aggregated to give the document score. This kind of hierarchical model is quite -difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras -example on this dataset performs quite poorly, because it cuts off the documents -so that they're a fixed size. This hurts review accuracy a lot, because people -often summarise their rating in the final sentence - -Prerequisites: -spacy download en_vectors_web_lg -pip install keras==2.0.9 - -Compatible with: spaCy v2.0.0+ -""" -import ml_datasets -import plac -import random -import pathlib -import cytoolz -import numpy -from keras.models import Sequential, model_from_json -from keras.layers import LSTM, Dense, Embedding, Bidirectional -from keras.layers import TimeDistributed -from keras.optimizers import Adam -from spacy.compat import pickle -import spacy - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - with (path / "config.json").open() as file_: - model = model_from_json(file_.read()) - with (path / "model").open("rb") as file_: - lstm_weights = pickle.load(file_) - embeddings = get_embeddings(nlp.vocab) - model.set_weights([embeddings] + lstm_weights) - return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. 
- # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -def get_labelled_sentences(docs, doc_labels): - labels = [] - sentences = [] - for doc, y in zip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, numpy.asarray(labels, dtype="int32") - - -def get_features(docs, max_length): - docs = list(docs) - Xs = numpy.zeros((len(docs), max_length), dtype="int32") - for i, doc in enumerate(docs): - j = 0 - for token in doc: - vector_id = token.vocab.vectors.find(key=token.orth) - if vector_id >= 0: - Xs[i, j] = vector_id - else: - Xs[i, j] = 0 - j += 1 - if j >= max_length: - break - return Xs - - -def train( - train_texts, - train_labels, - dev_texts, - dev_labels, - lstm_shape, - lstm_settings, - lstm_optimizer, - batch_size=100, - nb_epoch=5, - by_sentence=True, -): - - print("Loading spaCy") - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(nlp.create_pipe("sentencizer")) - embeddings = get_embeddings(nlp.vocab) - model = compile_lstm(embeddings, lstm_shape, lstm_settings) - - print("Parsing texts...") - train_docs = list(nlp.pipe(train_texts)) - dev_docs = list(nlp.pipe(dev_texts)) - if by_sentence: - train_docs, train_labels = get_labelled_sentences(train_docs, train_labels) - dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels) - - train_X = get_features(train_docs, lstm_shape["max_length"]) - dev_X = get_features(dev_docs, lstm_shape["max_length"]) - model.fit( - train_X, - train_labels, - validation_data=(dev_X, dev_labels), - epochs=nb_epoch, - batch_size=batch_size, - ) - return model - - -def compile_lstm(embeddings, shape, settings): - model = Sequential() - model.add( - Embedding( - embeddings.shape[0], - embeddings.shape[1], - input_length=shape["max_length"], - trainable=False, - weights=[embeddings], - mask_zero=True, - ) - ) - model.add(TimeDistributed(Dense(shape["nr_hidden"], use_bias=False))) - model.add( - Bidirectional( - LSTM( - shape["nr_hidden"], - recurrent_dropout=settings["dropout"], - dropout=settings["dropout"], - ) - ) - ) - model.add(Dense(shape["nr_class"], activation="sigmoid")) - model.compile( - optimizer=Adam(lr=settings["lr"]), - loss="binary_crossentropy", - metrics=["accuracy"], - ) - return model - - -def get_embeddings(vocab): - return vocab.vectors.data - - -def evaluate(model_dir, texts, labels, max_length=100): - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length)) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (("pos", 1), ("neg", 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips into two lists - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", 
float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int), -) -def main( - model_dir=None, - train_dir=None, - dev_dir=None, - is_runtime=False, - nr_hidden=64, - max_length=100, # Shape - dropout=0.5, - learn_rate=0.001, # General NN config - nb_epoch=5, - batch_size=256, - nr_examples=-1, -): # Training params - if model_dir is not None: - model_dir = pathlib.Path(model_dir) - if train_dir is None or dev_dir is None: - imdb_data = ml_datasets.imdb() - if is_runtime: - if dev_dir is None: - dev_texts, dev_labels = zip(*imdb_data[1]) - else: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - if train_dir is None: - train_texts, train_labels = zip(*imdb_data[0]) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - if dev_dir is None: - dev_texts, dev_labels = zip(*imdb_data[1]) - else: - dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples) - train_labels = numpy.asarray(train_labels, dtype="int32") - dev_labels = numpy.asarray(dev_labels, dtype="int32") - lstm = train( - train_texts, - train_labels, - dev_texts, - dev_labels, - {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1}, - {"dropout": dropout, "lr": learn_rate}, - {}, - nb_epoch=nb_epoch, - batch_size=batch_size, - ) - weights = lstm.get_weights() - if model_dir is not None: - with (model_dir / "model").open("wb") as file_: - pickle.dump(weights[1:], file_) - with (model_dir / "config.json").open("w") as file_: - file_.write(lstm.to_json()) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py deleted file mode 100644 index c40a3c10d..000000000 --- a/examples/information_extraction/entity_relations.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""A simple example of extracting relations between phrases and entities using -spaCy's named entity recognizer and the dependency parse. Here, we extract -money and currency values (entities labelled as MONEY) and then check the -dependency tree to find the noun phrase they are referring to – for example: -$9.4 million --> Net income. 
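The rule the script applies can be condensed into a few lines; the sketch below restates the same dependency walk that the full script implements further down, assuming `doc` was produced by a pipeline with both NER and a dependency parser:

```python
# Sketch of the relation rule: a MONEY entity in attr/dobj position is linked
# to the nsubj of its head verb; a MONEY entity under a preposition is linked
# to the head of that preposition.
def money_relations(doc):
    relations = []
    for money in (tok for tok in doc if tok.ent_type_ == "MONEY"):
        if money.dep_ in ("attr", "dobj"):
            subjects = [w for w in money.head.lefts if w.dep_ == "nsubj"]
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == "pobj" and money.head.dep_ == "prep":
            relations.append((money.head.head, money))
    return relations
```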
- -Compatible with: spaCy v2.0.0+ -Last tested with: v2.2.1 -""" -from __future__ import unicode_literals, print_function - -import plac -import spacy - - -TEXTS = [ - "Net income was $9.4 million compared to the prior year of $2.7 million.", - "Revenue exceeded twelve billion dollars, with a loss of $1b.", -] - - -@plac.annotations( - model=("Model to load (needs parser and NER)", "positional", None, str) -) -def main(model="en_core_web_sm"): - nlp = spacy.load(model) - print("Loaded model '%s'" % model) - print("Processing %d texts" % len(TEXTS)) - - for text in TEXTS: - doc = nlp(text) - relations = extract_currency_relations(doc) - for r1, r2 in relations: - print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text)) - - -def filter_spans(spans): - # Filter a sequence of spans so they don't contain overlaps - # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans() - get_sort_key = lambda span: (span.end - span.start, -span.start) - sorted_spans = sorted(spans, key=get_sort_key, reverse=True) - result = [] - seen_tokens = set() - for span in sorted_spans: - # Check for end - 1 here because boundaries are inclusive - if span.start not in seen_tokens and span.end - 1 not in seen_tokens: - result.append(span) - seen_tokens.update(range(span.start, span.end)) - result = sorted(result, key=lambda span: span.start) - return result - - -def extract_currency_relations(doc): - # Merge entities and noun chunks into one token - spans = list(doc.ents) + list(doc.noun_chunks) - spans = filter_spans(spans) - with doc.retokenize() as retokenizer: - for span in spans: - retokenizer.merge(span) - - relations = [] - for money in filter(lambda w: w.ent_type_ == "MONEY", doc): - if money.dep_ in ("attr", "dobj"): - subject = [w for w in money.head.lefts if w.dep_ == "nsubj"] - if subject: - subject = subject[0] - relations.append((subject, money)) - elif money.dep_ == "pobj" and money.head.dep_ == "prep": - relations.append((money.head.head, money)) - return relations - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Net income MONEY $9.4 million - # the prior year MONEY $2.7 million - # Revenue MONEY twelve billion dollars - # a loss MONEY 1b diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py deleted file mode 100644 index 2ca9da1ea..000000000 --- a/examples/information_extraction/parse_subtrees.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""This example shows how to navigate the parse tree including subtrees -attached to a word. - -Based on issue #252: -"In the documents and tutorials the main thing I haven't found is -examples on how to break sentences down into small sub thoughts/chunks. The -noun_chunks is handy, but having examples on using the token.head to find small -(near-complete) sentence chunks would be neat. Lets take the example sentence: -"displaCy uses CSS and JavaScript to show you how computers understand language" - -This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: -[displaCy] uses CSS and Javascript [to + show] -show you how computers understand [language] - -I'm assuming that we can use the token.head to build these groups." 
- -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import spacy - - -@plac.annotations(model=("Model to load", "positional", None, str)) -def main(model="en_core_web_sm"): - nlp = spacy.load(model) - print("Loaded model '%s'" % model) - - doc = nlp( - "displaCy uses CSS and JavaScript to show you how computers " - "understand language" - ) - - # The easiest way is to find the head of the subtree you want, and then use - # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` - # is the one that does what you're asking for most directly: - for word in doc: - if word.dep_ in ("xcomp", "ccomp"): - print("".join(w.text_with_ws for w in word.subtree)) - - # It'd probably be better for `word.subtree` to return a `Span` object - # instead of a generator over the tokens. If you want the `Span` you can - # get it via the `.right_edge` and `.left_edge` properties. The `Span` - # object is nice because you can easily get a vector, merge it, etc. - for word in doc: - if word.dep_ in ("xcomp", "ccomp"): - subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] - print(subtree_span.text, "|", subtree_span.root.text) - - # You might also want to select a head, and then select a start and end - # position by walking along its children. You could then take the - # `.left_edge` and `.right_edge` of those tokens, and use it to calculate - # a span. - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # to show you how computers understand language - # how computers understand language - # to show you how computers understand language | show - # how computers understand language | understand diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py deleted file mode 100644 index f3622bfdd..000000000 --- a/examples/information_extraction/phrase_matcher.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Match a large set of multi-word expressions in O(1) time. - -The idea is to associate each word in the vocabulary with a tag, noting whether -they begin, end, or are inside at least one pattern. An additional tag is used -for single-word patterns. Complete patterns are also stored in a hash set. -When we process a document, we look up the words in the vocabulary, to -associate the words with the tags. We then search for tag-sequences that -correspond to valid candidates. Finally, we look up the candidates in the hash -set. - -For instance, to search for the phrases "Barack Hussein Obama" and "Hilary -Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with -the I tag, and Obama and Clinton with the L tag. - -The document "Barack Clinton and Hilary Clinton" would have the tag sequence -[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second -candidate is in the phrase dictionary, so only one is returned as a match. - -The algorithm is O(n) at run-time for document of length n because we're only -ever matching over the tag patterns. So no matter how many phrases we're -looking for, our pattern set stays very small (exact size depends on the -maximum length we're looking for, as the query language currently has no -quantifiers). 
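In practice the script below delegates all of this bookkeeping to spaCy's `PhraseMatcher`. A minimal, self-contained sketch of that call pattern (using the spaCy v2-style `add(key, on_match, *docs)` signature that the script relies on; the phrase list is illustrative only):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
# Patterns are Doc objects; "Ann Arbor" and "Barack Hussein Obama" are taken
# from the pattern examples above.
patterns = [nlp.make_doc(text) for text in ["Ann Arbor", "Barack Hussein Obama"]]
matcher.add("Phrase", None, *patterns)  # v2 signature: key, on_match callback, *docs

doc = nlp("She moved to Ann Arbor last year.")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)
```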
- -The example expects a .bz2 file from the Reddit corpus, and a patterns file, -formatted in jsonl as a sequence of entries like this: - -{"text":"Anchorage"} -{"text":"Angola"} -{"text":"Ann Arbor"} -{"text":"Annapolis"} -{"text":"Appalachia"} -{"text":"Argentina"} - -Reddit comments corpus: -* https://files.pushshift.io/reddit/ -* https://archive.org/details/2015_reddit_comments_corpus - -Compatible with: spaCy v2.0.0+ -""" -from __future__ import print_function, unicode_literals, division - -from bz2 import BZ2File -import time -import plac -import json - -from spacy.matcher import PhraseMatcher -import spacy - - -@plac.annotations( - patterns_loc=("Path to gazetteer", "positional", None, str), - text_loc=("Path to Reddit corpus file", "positional", None, str), - n=("Number of texts to read", "option", "n", int), - lang=("Language class to initialise", "option", "l", str), -) -def main(patterns_loc, text_loc, n=10000, lang="en"): - nlp = spacy.blank(lang) - nlp.vocab.lex_attr_getters = {} - phrases = read_gazetteer(nlp.tokenizer, patterns_loc) - count = 0 - t1 = time.time() - for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): - count += 1 - t2 = time.time() - print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) - - -def read_gazetteer(tokenizer, loc, n=-1): - for i, line in enumerate(open(loc)): - data = json.loads(line.strip()) - phrase = tokenizer(data["text"]) - for w in phrase: - _ = tokenizer.vocab[w.text] - if len(phrase) >= 2: - yield phrase - - -def read_text(bz2_loc, n=10000): - with BZ2File(bz2_loc) as file_: - for i, line in enumerate(file_): - data = json.loads(line) - yield data["body"] - if i >= n: - break - - -def get_matches(tokenizer, phrases, texts): - matcher = PhraseMatcher(tokenizer.vocab) - matcher.add("Phrase", None, *phrases) - for text in texts: - doc = tokenizer(text) - for w in doc: - _ = doc.vocab[w.text] - matches = matcher(doc) - for ent_id, start, end in matches: - yield (ent_id, doc[start:end].text) - - -if __name__ == "__main__": - if False: - import cProfile - import pstats - - cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() - else: - plac.call(main) diff --git a/examples/keras_parikh_entailment/README.md b/examples/keras_parikh_entailment/README.md deleted file mode 100644 index 86ba50d9b..000000000 --- a/examples/keras_parikh_entailment/README.md +++ /dev/null @@ -1,114 +0,0 @@ - - -# A decomposable attention model for Natural Language Inference -**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)** -**Updated for spaCy 2.0+ and Keras 2.2.2+ by John Stewart, [@free-variation](https://github.com/free-variation)** - -This directory contains an implementation of the entailment prediction model described -by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable -for its competitive performance with very few parameters. - -The model is implemented using [Keras](https://keras.io/) and [spaCy](https://spacy.io). -Keras is used to build and train the network. spaCy is used to load -the [GloVe](http://nlp.stanford.edu/projects/glove/) vectors, perform the -feature extraction, and help you apply the model at run-time. 
The following -demo code shows how the entailment model can be used at runtime, once the -hook is installed to customise the `.similarity()` method of spaCy's `Doc` -and `Span` objects: - -```python -def demo(shape): - nlp = spacy.load('en_vectors_web_lg') - nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0])) - - doc1 = nlp(u'The king of France is bald.') - doc2 = nlp(u'France has no king.') - - print("Sentence 1:", doc1) - print("Sentence 2:", doc2) - - entailment_type, confidence = doc1.similarity(doc2) - print("Entailment type:", entailment_type, "(Confidence:", confidence, ")") -``` - -Which gives the output `Entailment type: contradiction (Confidence: 0.60604566)`, showing that -the system has definite opinions about Betrand Russell's [famous conundrum](https://users.drew.edu/jlenz/br-on-denoting.html)! - -I'm working on a blog post to explain Parikh et al.'s model in more detail. -A [notebook](https://github.com/free-variation/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb) is available that briefly explains this implementation. -I think it is a very interesting example of the attention mechanism, which -I didn't understand very well before working through this paper. There are -lots of ways to extend the model. - -## What's where - -| File | Description | -| --- | --- | -| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. | -| `spacy_hook.py` | Provides a class `KerasSimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. | -| `keras_decomposable_attention.py` | Defines the neural network model. | - -## Setting up - -First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spaCy -English models (about 1GB of data): - -```bash -pip install keras -pip install spacy -python -m spacy download en_vectors_web_lg -``` - -You'll also want to get Keras working on your GPU, and you will need a backend, such as TensorFlow or Theano. -This will depend on your set up, so you're mostly on your own for this step. If you're using AWS, try the -[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy. - -Once you've installed the dependencies, you can run a small preliminary test of -the Keras model: - -```bash -py.test keras_parikh_entailment/keras_decomposable_attention.py -``` - -This compiles the model and fits it with some dummy data. You should see that -both tests passed. - -Finally, download the [Stanford Natural Language Inference corpus](http://nlp.stanford.edu/projects/snli/). - -## Running the example - -You can run the `keras_parikh_entailment/` directory as a script, which executes the file -[`keras_parikh_entailment/__main__.py`](__main__.py). If you run the script without arguments -the usage is shown. Running it with `-h` explains the command line arguments. - -The first thing you'll want to do is train the model: - -```bash -python keras_parikh_entailment/ train -t -s -``` - -Training takes about 300 epochs for full accuracy, and I haven't rerun the full -experiment since refactoring things to publish this example — please let me -know if I've broken something. You should get to at least 85% on the development data even after 10-15 epochs. - -The other two modes demonstrate run-time usage. 
I never like relying on the accuracy printed -by `.fit()` methods. I never really feel confident until I've run a new process that loads -the model and starts making predictions, without access to the gold labels. I've therefore -included an `evaluate` mode. - -```bash -python keras_parikh_entailment/ evaluate -s -``` - -Finally, there's also a little demo, which mostly exists to show -you how run-time usage will eventually look. - -```bash -python keras_parikh_entailment/ demo -``` - -## Getting updates - -We should have the blog post explaining the model ready before the end of the week. To get -notified when it's published, you can either follow me on [Twitter](https://twitter.com/honnibal) -or subscribe to our [mailing list](http://eepurl.com/ckUpQ5). diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py deleted file mode 100644 index ad398dae3..000000000 --- a/examples/keras_parikh_entailment/__main__.py +++ /dev/null @@ -1,207 +0,0 @@ -import numpy as np -import json -from keras.utils import to_categorical -import plac -import sys - -from keras_decomposable_attention import build_model -from spacy_hook import get_embeddings, KerasSimilarityShim - -try: - import cPickle as pickle -except ImportError: - import pickle - -import spacy - -# workaround for keras/tensorflow bug -# see https://github.com/tensorflow/tensorflow/issues/3388 -import os -import importlib -from keras import backend as K - - -def set_keras_backend(backend): - if K.backend() != backend: - os.environ["KERAS_BACKEND"] = backend - importlib.reload(K) - assert K.backend() == backend - if backend == "tensorflow": - K.get_session().close() - cfg = K.tf.ConfigProto() - cfg.gpu_options.allow_growth = True - K.set_session(K.tf.Session(config=cfg)) - K.clear_session() - - -set_keras_backend("tensorflow") - - -def train(train_loc, dev_loc, shape, settings): - train_texts1, train_texts2, train_labels = read_snli(train_loc) - dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc) - - print("Loading spaCy") - nlp = spacy.load("en_vectors_web_lg") - assert nlp.path is not None - print("Processing texts...") - train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0]) - dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0]) - - print("Compiling network") - model = build_model(get_embeddings(nlp.vocab), shape, settings) - - print(settings) - model.fit( - train_X, - train_labels, - validation_data=(dev_X, dev_labels), - epochs=settings["nr_epoch"], - batch_size=settings["batch_size"], - ) - if not (nlp.path / "similarity").exists(): - (nlp.path / "similarity").mkdir() - print("Saving to", nlp.path / "similarity") - weights = model.get_weights() - # remove the embedding matrix. We can reconstruct it. 
- del weights[1] - with (nlp.path / "similarity" / "model").open("wb") as file_: - pickle.dump(weights, file_) - with (nlp.path / "similarity" / "config.json").open("w") as file_: - file_.write(model.to_json()) - - -def evaluate(dev_loc, shape): - dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc) - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0])) - total = 0.0 - correct = 0.0 - for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels): - doc1 = nlp(text1) - doc2 = nlp(text2) - sim, _ = doc1.similarity(doc2) - if sim == KerasSimilarityShim.entailment_types[label.argmax()]: - correct += 1 - total += 1 - return correct, total - - -def demo(shape): - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0])) - - doc1 = nlp("The king of France is bald.") - doc2 = nlp("France has no king.") - - print("Sentence 1:", doc1) - print("Sentence 2:", doc2) - - entailment_type, confidence = doc1.similarity(doc2) - print("Entailment type:", entailment_type, "(Confidence:", confidence, ")") - - -LABELS = {"entailment": 0, "contradiction": 1, "neutral": 2} - - -def read_snli(path): - texts1 = [] - texts2 = [] - labels = [] - with open(path, "r") as file_: - for line in file_: - eg = json.loads(line) - label = eg["gold_label"] - if label == "-": # per Parikh, ignore - SNLI entries - continue - texts1.append(eg["sentence1"]) - texts2.append(eg["sentence2"]) - labels.append(LABELS[label]) - return texts1, texts2, to_categorical(np.asarray(labels, dtype="int32")) - - -def create_dataset(nlp, texts, hypotheses, num_unk, max_length): - sents = texts + hypotheses - sents_as_ids = [] - for sent in sents: - doc = nlp(sent) - word_ids = [] - for i, token in enumerate(doc): - # skip odd spaces from tokenizer - if token.has_vector and token.vector_norm == 0: - continue - - if i > max_length: - break - - if token.has_vector: - word_ids.append(token.rank + num_unk + 1) - else: - # if we don't have a vector, pick an OOV entry - word_ids.append(token.rank % num_unk + 1) - - # there must be a simpler way of generating padded arrays from lists... 
- word_id_vec = np.zeros((max_length), dtype="int") - clipped_len = min(max_length, len(word_ids)) - word_id_vec[:clipped_len] = word_ids[:clipped_len] - sents_as_ids.append(word_id_vec) - - return [np.array(sents_as_ids[: len(texts)]), np.array(sents_as_ids[len(texts) :])] - - -@plac.annotations( - mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]), - train_loc=("Path to training data", "option", "t", str), - dev_loc=("Path to development or test data", "option", "s", str), - max_length=("Length to truncate sentences", "option", "L", int), - nr_hidden=("Number of hidden units", "option", "H", int), - dropout=("Dropout level", "option", "d", float), - learn_rate=("Learning rate", "option", "r", float), - batch_size=("Batch size for neural network training", "option", "b", int), - nr_epoch=("Number of training epochs", "option", "e", int), - entail_dir=( - "Direction of entailment", - "option", - "D", - str, - ["both", "left", "right"], - ), -) -def main( - mode, - train_loc, - dev_loc, - max_length=50, - nr_hidden=200, - dropout=0.2, - learn_rate=0.001, - batch_size=1024, - nr_epoch=10, - entail_dir="both", -): - shape = (max_length, nr_hidden, 3) - settings = { - "lr": learn_rate, - "dropout": dropout, - "batch_size": batch_size, - "nr_epoch": nr_epoch, - "entail_dir": entail_dir, - } - - if mode == "train": - if train_loc == None or dev_loc == None: - print("Train mode requires paths to training and development data sets.") - sys.exit(1) - train(train_loc, dev_loc, shape, settings) - elif mode == "evaluate": - if dev_loc == None: - print("Evaluate mode requires paths to test data set.") - sys.exit(1) - correct, total = evaluate(dev_loc, shape) - print(correct, "/", total, correct / total) - else: - demo(shape) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py deleted file mode 100644 index 2e17a11ee..000000000 --- a/examples/keras_parikh_entailment/keras_decomposable_attention.py +++ /dev/null @@ -1,152 +0,0 @@ -# Semantic entailment/similarity with decomposable attention (using spaCy and Keras) -# Practical state-of-the-art textual entailment with spaCy and Keras - -import numpy as np -from keras import layers, Model, models, optimizers -from keras import backend as K - - -def build_model(vectors, shape, settings): - max_length, nr_hidden, nr_class = shape - - input1 = layers.Input(shape=(max_length,), dtype="int32", name="words1") - input2 = layers.Input(shape=(max_length,), dtype="int32", name="words2") - - # embeddings (projected) - embed = create_embedding(vectors, max_length, nr_hidden) - - a = embed(input1) - b = embed(input2) - - # step 1: attend - F = create_feedforward(nr_hidden) - att_weights = layers.dot([F(a), F(b)], axes=-1) - - G = create_feedforward(nr_hidden) - - if settings["entail_dir"] == "both": - norm_weights_a = layers.Lambda(normalizer(1))(att_weights) - norm_weights_b = layers.Lambda(normalizer(2))(att_weights) - alpha = layers.dot([norm_weights_a, a], axes=1) - beta = layers.dot([norm_weights_b, b], axes=1) - - # step 2: compare - comp1 = layers.concatenate([a, beta]) - comp2 = layers.concatenate([b, alpha]) - v1 = layers.TimeDistributed(G)(comp1) - v2 = layers.TimeDistributed(G)(comp2) - - # step 3: aggregate - v1_sum = layers.Lambda(sum_word)(v1) - v2_sum = layers.Lambda(sum_word)(v2) - concat = layers.concatenate([v1_sum, v2_sum]) - - elif settings["entail_dir"] == "left": - 
norm_weights_a = layers.Lambda(normalizer(1))(att_weights) - alpha = layers.dot([norm_weights_a, a], axes=1) - comp2 = layers.concatenate([b, alpha]) - v2 = layers.TimeDistributed(G)(comp2) - v2_sum = layers.Lambda(sum_word)(v2) - concat = v2_sum - - else: - norm_weights_b = layers.Lambda(normalizer(2))(att_weights) - beta = layers.dot([norm_weights_b, b], axes=1) - comp1 = layers.concatenate([a, beta]) - v1 = layers.TimeDistributed(G)(comp1) - v1_sum = layers.Lambda(sum_word)(v1) - concat = v1_sum - - H = create_feedforward(nr_hidden) - out = H(concat) - out = layers.Dense(nr_class, activation="softmax")(out) - - model = Model([input1, input2], out) - - model.compile( - optimizer=optimizers.Adam(lr=settings["lr"]), - loss="categorical_crossentropy", - metrics=["accuracy"], - ) - - return model - - -def create_embedding(vectors, max_length, projected_dim): - return models.Sequential( - [ - layers.Embedding( - vectors.shape[0], - vectors.shape[1], - input_length=max_length, - weights=[vectors], - trainable=False, - ), - layers.TimeDistributed( - layers.Dense(projected_dim, activation=None, use_bias=False) - ), - ] - ) - - -def create_feedforward(num_units=200, activation="relu", dropout_rate=0.2): - return models.Sequential( - [ - layers.Dense(num_units, activation=activation), - layers.Dropout(dropout_rate), - layers.Dense(num_units, activation=activation), - layers.Dropout(dropout_rate), - ] - ) - - -def normalizer(axis): - def _normalize(att_weights): - exp_weights = K.exp(att_weights) - sum_weights = K.sum(exp_weights, axis=axis, keepdims=True) - return exp_weights / sum_weights - - return _normalize - - -def sum_word(x): - return K.sum(x, axis=1) - - -def test_build_model(): - vectors = np.ndarray((100, 8), dtype="float32") - shape = (10, 16, 3) - settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"} - model = build_model(vectors, shape, settings) - - -def test_fit_model(): - def _generate_X(nr_example, length, nr_vector): - X1 = np.ndarray((nr_example, length), dtype="int32") - X1 *= X1 < nr_vector - X1 *= 0 <= X1 - X2 = np.ndarray((nr_example, length), dtype="int32") - X2 *= X2 < nr_vector - X2 *= 0 <= X2 - return [X1, X2] - - def _generate_Y(nr_example, nr_class): - ys = np.zeros((nr_example, nr_class), dtype="int32") - for i in range(nr_example): - ys[i, i % nr_class] = 1 - return ys - - vectors = np.ndarray((100, 8), dtype="float32") - shape = (10, 16, 3) - settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"} - model = build_model(vectors, shape, settings) - - train_X = _generate_X(20, shape[0], vectors.shape[0]) - train_Y = _generate_Y(20, shape[2]) - dev_X = _generate_X(15, shape[0], vectors.shape[0]) - dev_Y = _generate_Y(15, shape[2]) - - model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4) - - -__all__ = [build_model] diff --git a/examples/keras_parikh_entailment/spacy_hook.py b/examples/keras_parikh_entailment/spacy_hook.py deleted file mode 100644 index 307669a70..000000000 --- a/examples/keras_parikh_entailment/spacy_hook.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -from keras.models import model_from_json - -try: - import cPickle as pickle -except ImportError: - import pickle - - -class KerasSimilarityShim(object): - entailment_types = ["entailment", "contradiction", "neutral"] - - @classmethod - def load(cls, path, nlp, max_length=100, get_features=None): - - if get_features is None: - get_features = get_word_ids - - with (path / "config.json").open() as file_: - model = 
model_from_json(file_.read()) - with (path / "model").open("rb") as file_: - weights = pickle.load(file_) - - embeddings = get_embeddings(nlp.vocab) - weights.insert(1, embeddings) - model.set_weights(weights) - - return cls(model, get_features=get_features, max_length=max_length) - - def __init__(self, model, get_features=None, max_length=100): - self.model = model - self.get_features = get_features - self.max_length = max_length - - def __call__(self, doc): - doc.user_hooks["similarity"] = self.predict - doc.user_span_hooks["similarity"] = self.predict - - return doc - - def predict(self, doc1, doc2): - x1 = self.get_features([doc1], max_length=self.max_length) - x2 = self.get_features([doc2], max_length=self.max_length) - scores = self.model.predict([x1, x2]) - - return self.entailment_types[scores.argmax()], scores.max() - - -def get_embeddings(vocab, nr_unk=100): - # the extra +1 is for a zero vector representing sentence-final padding - num_vectors = max(lex.rank for lex in vocab) + 2 - - # create random vectors for OOV tokens - oov = np.random.normal(size=(nr_unk, vocab.vectors_length)) - oov = oov / oov.sum(axis=1, keepdims=True) - - vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32") - vectors[1 : (nr_unk + 1),] = oov - for lex in vocab: - if lex.has_vector and lex.vector_norm > 0: - vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm - - return vectors - - -def get_word_ids(docs, max_length=100, nr_unk=100): - Xs = np.zeros((len(docs), max_length), dtype="int32") - - for i, doc in enumerate(docs): - for j, token in enumerate(doc): - if j == max_length: - break - if token.has_vector: - Xs[i, j] = token.rank + nr_unk + 1 - else: - Xs[i, j] = token.rank % nr_unk + 1 - return Xs diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py deleted file mode 100644 index f26e7fc49..000000000 --- a/examples/load_from_docbin.py +++ /dev/null @@ -1,45 +0,0 @@ -# coding: utf-8 -""" -Example of loading previously parsed text using spaCy's DocBin class. The example -performs an entity count to show that the annotations are available. 
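The example only covers the loading side; producing such a `.spacy` file is the mirror image. A minimal sketch, assuming `en_core_web_lg` and an illustrative output path (neither is part of the original script):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_lg")
texts = ["Net income was $9.4 million.", "Revenue exceeded twelve billion dollars."]

# Keep only the entity annotations, which is all the loader below counts.
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
for doc in nlp.pipe(texts):
    doc_bin.add(doc)

with open("example_parses.spacy", "wb") as file_:
    file_.write(doc_bin.to_bytes())
```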
-For more details, see https://spacy.io/usage/saving-loading#docs -Installation: -python -m spacy download en_core_web_lg -Usage: -python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy -""" -from __future__ import unicode_literals - -import spacy -from spacy.tokens import DocBin -from timeit import default_timer as timer -from collections import Counter - -EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy" - - -def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH): - nlp = spacy.load(model) - print("Reading data from {}".format(docbin_path)) - with open(docbin_path, "rb") as file_: - bytes_data = file_.read() - nr_word = 0 - start_time = timer() - entities = Counter() - docbin = DocBin().from_bytes(bytes_data) - for doc in docbin.get_docs(nlp.vocab): - nr_word += len(doc) - entities.update((e.label_, e.text) for e in doc.ents) - end_time = timer() - msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)" - wps = nr_word / (end_time - start_time) - print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps)) - print("Most common entities:") - for (label, entity), freq in entities.most_common(30): - print(freq, entity, label) - - -if __name__ == "__main__": - import plac - - plac.call(main) diff --git a/examples/notebooks/Decompositional Attention.ipynb b/examples/notebooks/Decompositional Attention.ipynb deleted file mode 100644 index 8baaf7d33..000000000 --- a/examples/notebooks/Decompositional Attention.ipynb +++ /dev/null @@ -1,955 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Natural language inference using spaCy and Keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook details an implementation of the natural language inference model presented in [(Parikh et al, 2016)](https://arxiv.org/abs/1606.01933). The model is notable for the small number of paramaters *and hyperparameters* it specifices, while still yielding good performance." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Constructing the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import spacy\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We only need the GloVe vectors from spaCy, not a full NLP pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "nlp = spacy.load('en_vectors_web_lg')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Function to load the SNLI dataset. The categories are converted to one-shot representation. The function comes from an example in spaCy." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jds/tensorflow-gpu/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", - " from ._conv import register_converters as _register_converters\n", - "Using TensorFlow backend.\n" - ] - } - ], - "source": [ - "import json\n", - "from keras.utils import to_categorical\n", - "\n", - "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n", - "def read_snli(path):\n", - " texts1 = []\n", - " texts2 = []\n", - " labels = []\n", - " with open(path, 'r') as file_:\n", - " for line in file_:\n", - " eg = json.loads(line)\n", - " label = eg['gold_label']\n", - " if label == '-': # per Parikh, ignore - SNLI entries\n", - " continue\n", - " texts1.append(eg['sentence1'])\n", - " texts2.append(eg['sentence2'])\n", - " labels.append(LABELS[label])\n", - " return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because Keras can do the train/test split for us, we'll load *all* SNLI triples from one file." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "texts,hypotheses,labels = read_snli('snli/snli_1.0_train.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):\n", - " sents = texts + hypotheses\n", - " \n", - " # the extra +1 is for a zero vector represting NULL for padding\n", - " num_vectors = max(lex.rank for lex in nlp.vocab) + 2 \n", - " \n", - " # create random vectors for OOV tokens\n", - " oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))\n", - " oov = oov / oov.sum(axis=1, keepdims=True)\n", - " \n", - " vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')\n", - " vectors[num_vectors:, ] = oov\n", - " for lex in nlp.vocab:\n", - " if lex.has_vector and lex.vector_norm > 0:\n", - " vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector\n", - " \n", - " sents_as_ids = []\n", - " for sent in sents:\n", - " doc = nlp(sent)\n", - " word_ids = []\n", - " \n", - " for i, token in enumerate(doc):\n", - " # skip odd spaces from tokenizer\n", - " if token.has_vector and token.vector_norm == 0:\n", - " continue\n", - " \n", - " if i > max_length:\n", - " break\n", - " \n", - " if token.has_vector:\n", - " word_ids.append(token.rank + 1)\n", - " else:\n", - " # if we don't have a vector, pick an OOV entry\n", - " word_ids.append(token.rank % num_oov + num_vectors) \n", - " \n", - " # there must be a simpler way of generating padded arrays from lists...\n", - " word_id_vec = np.zeros((max_length), dtype='int')\n", - " clipped_len = min(max_length, len(word_ids))\n", - " word_id_vec[:clipped_len] = word_ids[:clipped_len]\n", - " sents_as_ids.append(word_id_vec)\n", - " \n", - " \n", - " return vectors, np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "sem_vectors, text_vectors, hypothesis_vectors = create_dataset(nlp, texts, hypotheses, 100, 50, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "texts_test,hypotheses_test,labels_test = read_snli('snli/snli_1.0_test.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "_, text_vectors_test, 
hypothesis_vectors_test = create_dataset(nlp, texts_test, hypotheses_test, 100, 50, True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use spaCy to tokenize the sentences and return, when available, a semantic vector for each token. \n", - "\n", - "OOV terms (tokens for which no semantic vector is available) are assigned to one of a set of randomly-generated OOV vectors, per (Parikh et al, 2016).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we will clip sentences to 50 words maximum." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from keras import layers, Model, models\n", - "from keras import backend as K" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The embedding layer copies the 300-dimensional GloVe vectors into GPU memory. Per (Parikh et al, 2016), the vectors, which are not adapted during training, are projected down to lower-dimensional vectors using a trained projection matrix." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def create_embedding(vectors, max_length, projected_dim):\n", - " return models.Sequential([\n", - " layers.Embedding(\n", - " vectors.shape[0],\n", - " vectors.shape[1],\n", - " input_length=max_length,\n", - " weights=[vectors],\n", - " trainable=False),\n", - " \n", - " layers.TimeDistributed(\n", - " layers.Dense(projected_dim,\n", - " activation=None,\n", - " use_bias=False))\n", - " ])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Parikh model makes use of three feedforward blocks that construct nonlinear combinations of their input. Each block contains two ReLU layers and two dropout layers." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):\n", - " return models.Sequential([\n", - " layers.Dense(num_units, activation=activation),\n", - " layers.Dropout(dropout_rate),\n", - " layers.Dense(num_units, activation=activation),\n", - " layers.Dropout(dropout_rate)\n", - " ])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The basic idea of the (Parikh et al, 2016) model is to:\n", - "\n", - "1. *Align*: Construct an alignment of subphrases in the text and hypothesis using an attention-like mechanism, called \"decompositional\" because the layer is applied to each of the two sentences individually rather than to their product. The dot product of the nonlinear transformations of the inputs is then normalized vertically and horizontally to yield a pair of \"soft\" alignment structures, from text->hypothesis and hypothesis->text. Concretely, for each word in one sentence, a multinomial distribution is computed over the words of the other sentence, by learning a multinomial logistic with softmax target.\n", - "2. *Compare*: Each word is now compared to its aligned phrase using a function modeled as a two-layer feedforward ReLU network. The output is a high-dimensional representation of the strength of association between word and aligned phrase.\n", - "3. *Aggregate*: The comparison vectors are summed, separately, for the text and the hypothesis. 
The result is two vectors: one that describes the degree of association of the text to the hypothesis, and the second, of the hypothesis to the text.\n", - "4. Finally, these two vectors are processed by a dense layer followed by a softmax classifier, as usual.\n", - "\n", - "Note that because in entailment the truth conditions of the consequent must be a subset of those of the antecedent, it is not obvious that we need both vectors in step (3). Entailment is not symmetric. It may be enough to just use the hypothesis->text vector. We will explore this possibility later." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need a couple of little functions for Lambda layers to normalize and aggregate weights:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def normalizer(axis):\n", - " def _normalize(att_weights):\n", - " exp_weights = K.exp(att_weights)\n", - " sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)\n", - " return exp_weights/sum_weights\n", - " return _normalize\n", - "\n", - "def sum_word(x):\n", - " return K.sum(x, axis=1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(vectors, max_length, num_hidden, num_classes, projected_dim, entail_dir='both'):\n", - " input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')\n", - " input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')\n", - " \n", - " # embeddings (projected)\n", - " embed = create_embedding(vectors, max_length, projected_dim)\n", - " \n", - " a = embed(input1)\n", - " b = embed(input2)\n", - " \n", - " # step 1: attend\n", - " F = create_feedforward(num_hidden)\n", - " att_weights = layers.dot([F(a), F(b)], axes=-1)\n", - " \n", - " G = create_feedforward(num_hidden)\n", - " \n", - " if entail_dir == 'both':\n", - " norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n", - " norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n", - " alpha = layers.dot([norm_weights_a, a], axes=1)\n", - " beta = layers.dot([norm_weights_b, b], axes=1)\n", - "\n", - " # step 2: compare\n", - " comp1 = layers.concatenate([a, beta])\n", - " comp2 = layers.concatenate([b, alpha])\n", - " v1 = layers.TimeDistributed(G)(comp1)\n", - " v2 = layers.TimeDistributed(G)(comp2)\n", - "\n", - " # step 3: aggregate\n", - " v1_sum = layers.Lambda(sum_word)(v1)\n", - " v2_sum = layers.Lambda(sum_word)(v2)\n", - " concat = layers.concatenate([v1_sum, v2_sum])\n", - " elif entail_dir == 'left':\n", - " norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n", - " alpha = layers.dot([norm_weights_a, a], axes=1)\n", - " comp2 = layers.concatenate([b, alpha])\n", - " v2 = layers.TimeDistributed(G)(comp2)\n", - " v2_sum = layers.Lambda(sum_word)(v2)\n", - " concat = v2_sum\n", - " else:\n", - " norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n", - " beta = layers.dot([norm_weights_b, b], axes=1)\n", - " comp1 = layers.concatenate([a, beta])\n", - " v1 = layers.TimeDistributed(G)(comp1)\n", - " v1_sum = layers.Lambda(sum_word)(v1)\n", - " concat = v1_sum\n", - " \n", - " H = create_feedforward(num_hidden)\n", - " out = H(concat)\n", - " out = layers.Dense(num_classes, activation='softmax')(out)\n", - " \n", - " model = Model([input1, input2], out)\n", - " \n", - " model.compile(optimizer='adam',\n", - " loss='categorical_crossentropy',\n", - " metrics=['accuracy'])\n", - " return model\n", - " \n", - " \n", - " " 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "words1 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "words2 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "sequential_1 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n", - " words2[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_2 (Sequential) (None, 50, 200) 80400 sequential_1[1][0] \n", - " sequential_1[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_1 (Dot) (None, 50, 50) 0 sequential_2[1][0] \n", - " sequential_2[2][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_2 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_1 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "dot_3 (Dot) (None, 50, 200) 0 lambda_2[0][0] \n", - " sequential_1[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_2 (Dot) (None, 50, 200) 0 lambda_1[0][0] \n", - " sequential_1[1][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_1 (Concatenate) (None, 50, 400) 0 sequential_1[1][0] \n", - " dot_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_2 (Concatenate) (None, 50, 400) 0 sequential_1[2][0] \n", - " dot_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_2 (TimeDistrib (None, 50, 200) 120400 concatenate_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_3 (TimeDistrib (None, 50, 200) 120400 concatenate_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_3 (Lambda) (None, 200) 0 time_distributed_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_4 (Lambda) (None, 200) 0 time_distributed_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_3 (Concatenate) (None, 400) 0 lambda_3[0][0] \n", - " lambda_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_4 (Sequential) (None, 200) 120400 concatenate_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_8 (Dense) (None, 3) 603 
sequential_4[1][0] \n", - "==================================================================================================\n", - "Total params: 321,703,403\n", - "Trainable params: 381,803\n", - "Non-trainable params: 321,321,600\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "K.clear_session()\n", - "m = build_model(sem_vectors, 50, 200, 3, 200)\n", - "m.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of trainable parameters, ~381k, is the number given by Parikh et al, so we're on the right track." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Parikh et al use tiny batches of 4, training for 50MM batches, which amounts to around 500 epochs. Here we'll use large batches to better use the GPU, and train for fewer epochs -- for purposes of this experiment." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 549367 samples, validate on 9824 samples\n", - "Epoch 1/50\n", - "549367/549367 [==============================] - 34s 62us/step - loss: 0.7599 - acc: 0.6617 - val_loss: 0.5396 - val_acc: 0.7861\n", - "Epoch 2/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.5611 - acc: 0.7763 - val_loss: 0.4892 - val_acc: 0.8085\n", - "Epoch 3/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.5212 - acc: 0.7948 - val_loss: 0.4574 - val_acc: 0.8261\n", - "Epoch 4/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4986 - acc: 0.8045 - val_loss: 0.4410 - val_acc: 0.8274\n", - "Epoch 5/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4819 - acc: 0.8114 - val_loss: 0.4224 - val_acc: 0.8383\n", - "Epoch 6/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4714 - acc: 0.8166 - val_loss: 0.4200 - val_acc: 0.8379\n", - "Epoch 7/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4633 - acc: 0.8203 - val_loss: 0.4098 - val_acc: 0.8457\n", - "Epoch 8/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4558 - acc: 0.8232 - val_loss: 0.4114 - val_acc: 0.8415\n", - "Epoch 9/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4508 - acc: 0.8250 - val_loss: 0.4062 - val_acc: 0.8477\n", - "Epoch 10/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4433 - acc: 0.8286 - val_loss: 0.3982 - val_acc: 0.8486\n", - "Epoch 11/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4388 - acc: 0.8307 - val_loss: 0.3953 - val_acc: 0.8497\n", - "Epoch 12/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4351 - acc: 0.8321 - val_loss: 0.3973 - val_acc: 0.8522\n", - "Epoch 13/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4309 - acc: 0.8342 - val_loss: 0.3939 - val_acc: 0.8539\n", - "Epoch 14/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4269 - acc: 0.8355 - val_loss: 0.3932 - val_acc: 0.8517\n", - "Epoch 15/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4247 - acc: 0.8369 - 
val_loss: 0.3938 - val_acc: 0.8515\n", - "Epoch 16/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4208 - acc: 0.8379 - val_loss: 0.3936 - val_acc: 0.8504\n", - "Epoch 17/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4194 - acc: 0.8390 - val_loss: 0.3885 - val_acc: 0.8560\n", - "Epoch 18/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4162 - acc: 0.8402 - val_loss: 0.3874 - val_acc: 0.8561\n", - "Epoch 19/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4140 - acc: 0.8409 - val_loss: 0.3889 - val_acc: 0.8545\n", - "Epoch 20/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4114 - acc: 0.8426 - val_loss: 0.3864 - val_acc: 0.8583\n", - "Epoch 21/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4092 - acc: 0.8430 - val_loss: 0.3870 - val_acc: 0.8561\n", - "Epoch 22/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4062 - acc: 0.8442 - val_loss: 0.3852 - val_acc: 0.8577\n", - "Epoch 23/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4050 - acc: 0.8450 - val_loss: 0.3850 - val_acc: 0.8578\n", - "Epoch 24/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4035 - acc: 0.8455 - val_loss: 0.3825 - val_acc: 0.8555\n", - "Epoch 25/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4018 - acc: 0.8460 - val_loss: 0.3837 - val_acc: 0.8573\n", - "Epoch 26/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3989 - acc: 0.8476 - val_loss: 0.3843 - val_acc: 0.8599\n", - "Epoch 27/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3979 - acc: 0.8481 - val_loss: 0.3841 - val_acc: 0.8589\n", - "Epoch 28/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3967 - acc: 0.8484 - val_loss: 0.3811 - val_acc: 0.8575\n", - "Epoch 29/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3956 - acc: 0.8492 - val_loss: 0.3829 - val_acc: 0.8589\n", - "Epoch 30/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3938 - acc: 0.8499 - val_loss: 0.3859 - val_acc: 0.8562\n", - "Epoch 31/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3925 - acc: 0.8500 - val_loss: 0.3798 - val_acc: 0.8587\n", - "Epoch 32/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3906 - acc: 0.8509 - val_loss: 0.3834 - val_acc: 0.8569\n", - "Epoch 33/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3893 - acc: 0.8511 - val_loss: 0.3806 - val_acc: 0.8588\n", - "Epoch 34/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3885 - acc: 0.8515 - val_loss: 0.3828 - val_acc: 0.8603\n", - "Epoch 35/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3879 - acc: 0.8520 - val_loss: 0.3800 - val_acc: 0.8594\n", - "Epoch 36/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3860 - acc: 0.8530 - val_loss: 0.3796 - val_acc: 0.8577\n", - "Epoch 37/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3856 - acc: 0.8532 - val_loss: 0.3857 - val_acc: 0.8591\n", - "Epoch 38/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3838 - 
acc: 0.8535 - val_loss: 0.3835 - val_acc: 0.8603\n", - "Epoch 39/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3830 - acc: 0.8543 - val_loss: 0.3830 - val_acc: 0.8599\n", - "Epoch 40/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3818 - acc: 0.8548 - val_loss: 0.3832 - val_acc: 0.8559\n", - "Epoch 41/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3806 - acc: 0.8551 - val_loss: 0.3845 - val_acc: 0.8553\n", - "Epoch 42/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3803 - acc: 0.8550 - val_loss: 0.3789 - val_acc: 0.8617\n", - "Epoch 43/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3791 - acc: 0.8556 - val_loss: 0.3835 - val_acc: 0.8580\n", - "Epoch 44/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3778 - acc: 0.8565 - val_loss: 0.3799 - val_acc: 0.8580\n", - "Epoch 45/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3766 - acc: 0.8571 - val_loss: 0.3790 - val_acc: 0.8625\n", - "Epoch 46/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3770 - acc: 0.8569 - val_loss: 0.3820 - val_acc: 0.8590\n", - "Epoch 47/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3761 - acc: 0.8573 - val_loss: 0.3831 - val_acc: 0.8581\n", - "Epoch 48/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3739 - acc: 0.8579 - val_loss: 0.3828 - val_acc: 0.8599\n", - "Epoch 49/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3738 - acc: 0.8577 - val_loss: 0.3785 - val_acc: 0.8590\n", - "Epoch 50/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3726 - acc: 0.8580 - val_loss: 0.3820 - val_acc: 0.8585\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result is broadly in the region reported by Parikh et al: ~86 vs 86.3%. The small difference might be accounted for by differences in `max_length` (here set at 50) and in the training regime." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment: the asymmetric model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It was suggested earlier that, based on the semantics of entailment, the vector representing the strength of association between the hypothesis and the text is all that is needed for classifying the entailment.\n", - "\n", - "The following model removes consideration of the complementary vector (text to hypothesis) from the computation. This will decrease the parameter count slightly, because the final dense layers will be smaller, and speed up the forward pass when predicting, because fewer calculations will be needed."
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "words2 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "words1 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "sequential_5 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n", - " words2[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_6 (Sequential) (None, 50, 200) 80400 sequential_5[1][0] \n", - " sequential_5[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_4 (Dot) (None, 50, 50) 0 sequential_6[1][0] \n", - " sequential_6[2][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_5 (Lambda) (None, 50, 50) 0 dot_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "dot_5 (Dot) (None, 50, 200) 0 lambda_5[0][0] \n", - " sequential_5[1][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_4 (Concatenate) (None, 50, 400) 0 sequential_5[2][0] \n", - " dot_5[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_5 (TimeDistrib (None, 50, 200) 120400 concatenate_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_6 (Lambda) (None, 200) 0 time_distributed_5[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_8 (Sequential) (None, 200) 80400 lambda_6[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_16 (Dense) (None, 3) 603 sequential_8[1][0] \n", - "==================================================================================================\n", - "Total params: 321,663,403\n", - "Trainable params: 341,803\n", - "Non-trainable params: 321,321,600\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "m1 = build_model(sem_vectors, 50, 200, 3, 200, 'left')\n", - "m1.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The parameter count has indeed decreased by 40,000, corresponding to the 200x200 smaller H function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 549367 samples, validate on 9824 samples\n", - "Epoch 1/50\n", - "549367/549367 [==============================] - 25s 46us/step - loss: 0.7331 - acc: 0.6770 - val_loss: 0.5257 - val_acc: 0.7936\n", - "Epoch 2/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.5518 - acc: 0.7799 - val_loss: 0.4717 - val_acc: 0.8159\n", - "Epoch 3/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.5147 - acc: 0.7967 - val_loss: 0.4449 - val_acc: 0.8278\n", - "Epoch 4/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4948 - acc: 0.8060 - val_loss: 0.4326 - val_acc: 0.8344\n", - "Epoch 5/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4814 - acc: 0.8122 - val_loss: 0.4247 - val_acc: 0.8359\n", - "Epoch 6/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4712 - acc: 0.8162 - val_loss: 0.4143 - val_acc: 0.8430\n", - "Epoch 7/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4635 - acc: 0.8205 - val_loss: 0.4172 - val_acc: 0.8401\n", - "Epoch 8/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4570 - acc: 0.8223 - val_loss: 0.4106 - val_acc: 0.8422\n", - "Epoch 9/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4505 - acc: 0.8259 - val_loss: 0.4043 - val_acc: 0.8451\n", - "Epoch 10/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4459 - acc: 0.8280 - val_loss: 0.4050 - val_acc: 0.8467\n", - "Epoch 11/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4405 - acc: 0.8300 - val_loss: 0.3975 - val_acc: 0.8481\n", - "Epoch 12/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4360 - acc: 0.8324 - val_loss: 0.4026 - val_acc: 0.8496\n", - "Epoch 13/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4327 - acc: 0.8334 - val_loss: 0.4024 - val_acc: 0.8471\n", - "Epoch 14/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4293 - acc: 0.8350 - val_loss: 0.3955 - val_acc: 0.8496\n", - "Epoch 15/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4263 - acc: 0.8369 - val_loss: 0.3980 - val_acc: 0.8490\n", - "Epoch 16/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4236 - acc: 0.8377 - val_loss: 0.3958 - val_acc: 0.8496\n", - "Epoch 17/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4213 - acc: 0.8384 - val_loss: 0.3954 - val_acc: 0.8496\n", - "Epoch 18/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4187 - acc: 0.8394 - val_loss: 0.3929 - val_acc: 0.8514\n", - "Epoch 19/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4157 - acc: 0.8409 - val_loss: 0.3939 - val_acc: 0.8507\n", - "Epoch 20/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4135 - acc: 0.8417 - val_loss: 0.3953 - val_acc: 0.8522\n", - "Epoch 21/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4122 - acc: 0.8424 - val_loss: 0.3974 - val_acc: 0.8506\n", - "Epoch 22/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 
0.4099 - acc: 0.8435 - val_loss: 0.3918 - val_acc: 0.8522\n", - "Epoch 23/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4075 - acc: 0.8443 - val_loss: 0.3901 - val_acc: 0.8513\n", - "Epoch 24/50\n", - "549367/549367 [==============================] - 24s 44us/step - loss: 0.4067 - acc: 0.8447 - val_loss: 0.3885 - val_acc: 0.8543\n", - "Epoch 25/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4047 - acc: 0.8454 - val_loss: 0.3846 - val_acc: 0.8531\n", - "Epoch 26/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4031 - acc: 0.8461 - val_loss: 0.3864 - val_acc: 0.8562\n", - "Epoch 27/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4020 - acc: 0.8467 - val_loss: 0.3874 - val_acc: 0.8546\n", - "Epoch 28/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4001 - acc: 0.8473 - val_loss: 0.3848 - val_acc: 0.8534\n", - "Epoch 29/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3991 - acc: 0.8479 - val_loss: 0.3865 - val_acc: 0.8562\n", - "Epoch 30/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3976 - acc: 0.8484 - val_loss: 0.3833 - val_acc: 0.8574\n", - "Epoch 31/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3961 - acc: 0.8487 - val_loss: 0.3846 - val_acc: 0.8585\n", - "Epoch 32/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3942 - acc: 0.8498 - val_loss: 0.3805 - val_acc: 0.8573\n", - "Epoch 33/50\n", - "549367/549367 [==============================] - 24s 44us/step - loss: 0.3935 - acc: 0.8503 - val_loss: 0.3856 - val_acc: 0.8579\n", - "Epoch 34/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3923 - acc: 0.8507 - val_loss: 0.3829 - val_acc: 0.8560\n", - "Epoch 35/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3920 - acc: 0.8508 - val_loss: 0.3864 - val_acc: 0.8575\n", - "Epoch 36/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3907 - acc: 0.8516 - val_loss: 0.3873 - val_acc: 0.8563\n", - "Epoch 37/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3891 - acc: 0.8519 - val_loss: 0.3850 - val_acc: 0.8570\n", - "Epoch 38/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3872 - acc: 0.8522 - val_loss: 0.3815 - val_acc: 0.8591\n", - "Epoch 39/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3887 - acc: 0.8520 - val_loss: 0.3829 - val_acc: 0.8590\n", - "Epoch 40/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3868 - acc: 0.8531 - val_loss: 0.3807 - val_acc: 0.8600\n", - "Epoch 41/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3859 - acc: 0.8537 - val_loss: 0.3832 - val_acc: 0.8574\n", - "Epoch 42/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3849 - acc: 0.8537 - val_loss: 0.3850 - val_acc: 0.8576\n", - "Epoch 43/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3834 - acc: 0.8541 - val_loss: 0.3825 - val_acc: 0.8563\n", - "Epoch 44/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3829 - acc: 0.8548 - val_loss: 0.3844 - val_acc: 0.8540\n", - "Epoch 45/50\n", - "549367/549367 [==============================] - 25s 
45us/step - loss: 0.3816 - acc: 0.8552 - val_loss: 0.3841 - val_acc: 0.8559\n", - "Epoch 46/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8549 - val_loss: 0.3880 - val_acc: 0.8567\n", - "Epoch 47/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.3799 - acc: 0.8559 - val_loss: 0.3767 - val_acc: 0.8635\n", - "Epoch 48/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3800 - acc: 0.8560 - val_loss: 0.3786 - val_acc: 0.8563\n", - "Epoch 49/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3781 - acc: 0.8563 - val_loss: 0.3812 - val_acc: 0.8596\n", - "Epoch 50/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3788 - acc: 0.8560 - val_loss: 0.3782 - val_acc: 0.8601\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m1.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This model performs the same as the slightly more complex model that evaluates alignments in both directions. Note also that processing time is improved, from 64 down to 48 microseconds per step. \n", - "\n", - "Let's now look at an asymmetric model that evaluates text to hypothesis comparisons. The prediction is that such a model will correctly classify a decent proportion of the exemplars, but not as accurately as the previous two.\n", - "\n", - "We'll just use 10 epochs for expediency." - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "words1 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "words2 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "sequential_13 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n", - " words2[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_14 (Sequential) (None, 50, 200) 80400 sequential_13[1][0] \n", - " sequential_13[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_8 (Dot) (None, 50, 50) 0 sequential_14[1][0] \n", - " sequential_14[2][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_9 (Lambda) (None, 50, 50) 0 dot_8[0][0] \n", - "__________________________________________________________________________________________________\n", - "dot_9 (Dot) (None, 50, 200) 0 lambda_9[0][0] \n", - " sequential_13[2][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_6 (Concatenate) (None, 50, 400) 0 sequential_13[1][0] \n", - " dot_9[0][0] \n", - 
"__________________________________________________________________________________________________\n", - "time_distributed_9 (TimeDistrib (None, 50, 200) 120400 concatenate_6[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_10 (Lambda) (None, 200) 0 time_distributed_9[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_16 (Sequential) (None, 200) 80400 lambda_10[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_32 (Dense) (None, 3) 603 sequential_16[1][0] \n", - "==================================================================================================\n", - "Total params: 321,663,403\n", - "Trainable params: 341,803\n", - "Non-trainable params: 321,321,600\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "m2 = build_model(sem_vectors, 50, 200, 3, 200, 'right')\n", - "m2.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 455226 samples, validate on 113807 samples\n", - "Epoch 1/10\n", - "455226/455226 [==============================] - 22s 49us/step - loss: 0.8920 - acc: 0.5771 - val_loss: 0.8001 - val_acc: 0.6435\n", - "Epoch 2/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.7808 - acc: 0.6553 - val_loss: 0.7267 - val_acc: 0.6855\n", - "Epoch 3/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.7329 - acc: 0.6825 - val_loss: 0.6966 - val_acc: 0.7006\n", - "Epoch 4/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.7055 - acc: 0.6978 - val_loss: 0.6713 - val_acc: 0.7150\n", - "Epoch 5/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6862 - acc: 0.7081 - val_loss: 0.6533 - val_acc: 0.7253\n", - "Epoch 6/10\n", - "455226/455226 [==============================] - 21s 47us/step - loss: 0.6694 - acc: 0.7179 - val_loss: 0.6472 - val_acc: 0.7277\n", - "Epoch 7/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6555 - acc: 0.7252 - val_loss: 0.6338 - val_acc: 0.7347\n", - "Epoch 8/10\n", - "455226/455226 [==============================] - 22s 48us/step - loss: 0.6434 - acc: 0.7310 - val_loss: 0.6246 - val_acc: 0.7385\n", - "Epoch 9/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6325 - acc: 0.7367 - val_loss: 0.6164 - val_acc: 0.7424\n", - "Epoch 10/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6216 - acc: 0.7426 - val_loss: 0.6082 - val_acc: 0.7478\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=10,validation_split=.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Comparing this fit to the validation accuracy of the previous two models after 10 epochs, we observe that its accuracy is roughly 10% lower.\n", - "\n", - "It is reassuring that the neural modeling here reproduces what we know from the semantics of natural language!" 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py deleted file mode 100644 index 7f97bc1c3..000000000 --- a/examples/pipeline/custom_attr_methods.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -"""This example contains several snippets of methods that can be set via custom -Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like -they're "bound" to the object and are partially applied – i.e. the object -they're called on is passed in as the first argument. - -* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -from spacy.lang.en import English -from spacy.tokens import Doc, Span -from spacy import displacy -from pathlib import Path - - -@plac.annotations( - output_dir=("Output directory for saved HTML", "positional", None, Path) -) -def main(output_dir=None): - nlp = English() # start off with blank English class - - Doc.set_extension("overlap", method=overlap_tokens) - doc1 = nlp("Peach emoji is where it has always been.") - doc2 = nlp("Peach is the superior emoji.") - print("Text 1:", doc1.text) - print("Text 2:", doc2.text) - print("Overlapping tokens:", doc1._.overlap(doc2)) - - Doc.set_extension("to_html", method=to_html) - doc = nlp("This is a sentence about Apple.") - # add entity manually for demo purposes, to make it work without a model - doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings["ORG"])] - print("Text:", doc.text) - doc._.to_html(output=output_dir, style="ent") - - -def to_html(doc, output="/tmp", style="dep"): - """Doc method extension for saving the current state as a displaCy - visualization. - """ - # generate filename from first six non-punct tokens - file_name = "-".join([w.text for w in doc[:6] if not w.is_punct]) + ".html" - html = displacy.render(doc, style=style, page=True) # render markup - if output is not None: - output_path = Path(output) - if not output_path.exists(): - output_path.mkdir() - output_file = Path(output) / file_name - output_file.open("w", encoding="utf-8").write(html) # save to file - print("Saved HTML to {}".format(output_file)) - else: - print(html) - - -def overlap_tokens(doc, other_doc): - """Get the tokens from the original Doc that are also in the comparison Doc. - """ - overlap = [] - other_tokens = [token.text for token in other_doc] - for token in doc: - if token.text in other_tokens: - overlap.append(token) - return overlap - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Text 1: Peach emoji is where it has always been. - # Text 2: Peach is the superior emoji. - # Overlapping tokens: [Peach, emoji, is, .] 
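A minimal usage sketch of the attribute-method pattern that the deleted custom_attr_methods.py example above demonstrates, with a guard so that re-running the registration in one process (e.g. in a notebook) does not raise on a duplicate extension name. It assumes a reasonably recent spaCy of the v2 line (the example's last-tested version is v2.1), where Doc.has_extension is available; the helper simply mirrors overlap_tokens from that file.

import spacy
from spacy.tokens import Doc


def overlap_tokens(doc, other_doc):
    # Same behaviour as the helper in the example above: tokens of `doc`
    # whose text also occurs in `other_doc`.
    other_texts = {token.text for token in other_doc}
    return [token for token in doc if token.text in other_texts]


# set_extension raises if "overlap" is already registered, so check first
# (passing force=True would overwrite the existing extension instead).
if not Doc.has_extension("overlap"):
    Doc.set_extension("overlap", method=overlap_tokens)

nlp = spacy.blank("en")
doc1 = nlp("Peach emoji is where it has always been.")
doc2 = nlp("Peach is the superior emoji.")
print(doc1._.overlap(doc2))  # expected: [Peach, emoji, is, .]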
diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py deleted file mode 100644 index 241c0af37..000000000 --- a/examples/pipeline/custom_component_countries_api.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of a spaCy v2.0 pipeline component that requests all countries via -the REST Countries API, merges country names into one token, assigns entity -labels and sets attributes on country tokens, e.g. the capital and lat/lng -coordinates. Can be extended with more details from the API. - -* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0) -* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -Prerequisites: pip install requests -""" -from __future__ import unicode_literals, print_function - -import requests -import plac -from spacy.lang.en import English -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc, Span, Token - - -def main(): - # For simplicity, we start off with only the blank English Language class - # and no model or pre-defined pipeline loaded. - nlp = English() - rest_countries = RESTCountriesComponent(nlp) # initialise component - nlp.add_pipe(rest_countries) # add it to the pipeline - doc = nlp("Some text about Colombia and the Czech Republic") - print("Pipeline", nlp.pipe_names) # pipeline contains component name - print("Doc has countries", doc._.has_country) # Doc contains countries - for token in doc: - if token._.is_country: - print( - token.text, - token._.country_capital, - token._.country_latlng, - token._.country_flag, - ) # country data - print("Entities", [(e.text, e.label_) for e in doc.ents]) # entities - - -class RESTCountriesComponent(object): - """spaCy v2.0 pipeline component that requests all countries via - the REST Countries API, merges country names into one token, assigns entity - labels and sets attributes on country tokens. - """ - - name = "rest_countries" # component name, will show up in the pipeline - - def __init__(self, nlp, label="GPE"): - """Initialise the pipeline component. The shared nlp instance is used - to initialise the matcher with the shared vocab, get the label ID and - generate Doc objects as phrase match patterns. - """ - # Make request once on initialisation and store the data - r = requests.get("https://restcountries.eu/rest/v2/all") - r.raise_for_status() # make sure requests raises an error if it fails - countries = r.json() - - # Convert API response to dict keyed by country name for easy lookup - # This could also be extended using the alternative and foreign language - # names provided by the API - self.countries = {c["name"]: c for c in countries} - self.label = nlp.vocab.strings[label] # get entity label ID - - # Set up the PhraseMatcher with Doc patterns for each country name - patterns = [nlp(c) for c in self.countries.keys()] - self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add("COUNTRIES", None, *patterns) - - # Register attribute on the Token. We'll be overwriting this based on - # the matches, so we're only setting a default value, not a getter. - # If no default value is set, it defaults to None. 
- Token.set_extension("is_country", default=False) - Token.set_extension("country_capital", default=False) - Token.set_extension("country_latlng", default=False) - Token.set_extension("country_flag", default=False) - - # Register attributes on Doc and Span via a getter that checks if one of - # the contained tokens is set to is_country == True. - Doc.set_extension("has_country", getter=self.has_country) - Span.set_extension("has_country", getter=self.has_country) - - def __call__(self, doc): - """Apply the pipeline component on a Doc object and modify it if matches - are found. Return the Doc, so it can be processed by the next component - in the pipeline, if available. - """ - matches = self.matcher(doc) - spans = [] # keep the spans for later so we can merge them afterwards - for _, start, end in matches: - # Generate Span representing the entity & set label - entity = Span(doc, start, end, label=self.label) - spans.append(entity) - # Set custom attribute on each token of the entity - # Can be extended with other data returned by the API, like - # currencies, country code, flag, calling code etc. - for token in entity: - token._.set("is_country", True) - token._.set("country_capital", self.countries[entity.text]["capital"]) - token._.set("country_latlng", self.countries[entity.text]["latlng"]) - token._.set("country_flag", self.countries[entity.text]["flag"]) - # Overwrite doc.ents and add entity – be careful not to replace! - doc.ents = list(doc.ents) + [entity] - for span in spans: - # Iterate over all spans and merge them into one token. This is done - # after setting the entities – otherwise, it would cause mismatched - # indices! - span.merge() - return doc # don't forget to return the Doc! - - def has_country(self, tokens): - """Getter for Doc and Span attributes. Returns True if one of the tokens - is a country. Since the getter is only called when we access the - attribute, we can refer to the Token's 'is_country' attribute here, - which is already set in the processing step.""" - return any([t._.get("is_country") for t in tokens]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Pipeline ['rest_countries'] - # Doc has countries True - # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg - # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg - # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')] diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py deleted file mode 100644 index a53b688b0..000000000 --- a/examples/pipeline/custom_component_entities.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of a spaCy v2.0 pipeline component that sets entity annotations -based on list of single or multiple-word company names. Companies are -labelled as ORG and their spans are merged into one token. Additionally, -._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token -respectively. - -* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -from spacy.lang.en import English -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc, Span, Token - - -@plac.annotations( - text=("Text to process", "positional", None, str), - companies=("Names of technology companies", "positional", None, str), -) -def main(text="Alphabet Inc. 
is the company behind Google.", *companies): - # For simplicity, we start off with only the blank English Language class - # and no model or pre-defined pipeline loaded. - nlp = English() - if not companies: # set default companies if none are set via args - companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"] # etc. - component = TechCompanyRecognizer(nlp, companies) # initialise component - nlp.add_pipe(component, last=True) # add last to the pipeline - - doc = nlp(text) - print("Pipeline", nlp.pipe_names) # pipeline contains component name - print("Tokens", [t.text for t in doc]) # company names from the list are merged - print("Doc has_tech_org", doc._.has_tech_org) # Doc contains tech orgs - print("Token 0 is_tech_org", doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org - print("Token 1 is_tech_org", doc[1]._.is_tech_org) # "is" is not - print("Entities", [(e.text, e.label_) for e in doc.ents]) # all orgs are entities - - -class TechCompanyRecognizer(object): - """Example of a spaCy v2.0 pipeline component that sets entity annotations - based on list of single or multiple-word company names. Companies are - labelled as ORG and their spans are merged into one token. Additionally, - ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token - respectively.""" - - name = "tech_companies" # component name, will show up in the pipeline - - def __init__(self, nlp, companies=tuple(), label="ORG"): - """Initialise the pipeline component. The shared nlp instance is used - to initialise the matcher with the shared vocab, get the label ID and - generate Doc objects as phrase match patterns. - """ - self.label = nlp.vocab.strings[label] # get entity label ID - - # Set up the PhraseMatcher – it can now take Doc objects as patterns, - # so even if the list of companies is long, it's very efficient - patterns = [nlp(org) for org in companies] - self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add("TECH_ORGS", None, *patterns) - - # Register attribute on the Token. We'll be overwriting this based on - # the matches, so we're only setting a default value, not a getter. - Token.set_extension("is_tech_org", default=False) - - # Register attributes on Doc and Span via a getter that checks if one of - # the contained tokens is set to is_tech_org == True. - Doc.set_extension("has_tech_org", getter=self.has_tech_org) - Span.set_extension("has_tech_org", getter=self.has_tech_org) - - def __call__(self, doc): - """Apply the pipeline component on a Doc object and modify it if matches - are found. Return the Doc, so it can be processed by the next component - in the pipeline, if available. - """ - matches = self.matcher(doc) - spans = [] # keep the spans for later so we can merge them afterwards - for _, start, end in matches: - # Generate Span representing the entity & set label - entity = Span(doc, start, end, label=self.label) - spans.append(entity) - # Set custom attribute on each token of the entity - for token in entity: - token._.set("is_tech_org", True) - # Overwrite doc.ents and add entity – be careful not to replace! - doc.ents = list(doc.ents) + [entity] - for span in spans: - # Iterate over all spans and merge them into one token. This is done - # after setting the entities – otherwise, it would cause mismatched - # indices! - span.merge() - return doc # don't forget to return the Doc! - - def has_tech_org(self, tokens): - """Getter for Doc and Span attributes. Returns True if one of the tokens - is a tech org. 
Since the getter is only called when we access the - attribute, we can refer to the Token's 'is_tech_org' attribute here, - which is already set in the processing step.""" - return any([t._.get("is_tech_org") for t in tokens]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Pipeline ['tech_companies'] - # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.'] - # Doc has_tech_org True - # Token 0 is_tech_org True - # Token 1 is_tech_org False - # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')] diff --git a/examples/pipeline/custom_sentence_segmentation.py b/examples/pipeline/custom_sentence_segmentation.py deleted file mode 100644 index ff59ab187..000000000 --- a/examples/pipeline/custom_sentence_segmentation.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Example of adding a pipeline component to prohibit sentence boundaries -before certain tokens. - -What we do is write to the token.is_sent_start attribute, which -takes values in {True, False, None}. The default value None allows the parser -to predict sentence segments. The value False prohibits the parser from inserting -a sentence boundary before that token. Note that fixing the sentence segmentation -should also improve the parse quality. - -The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627 -Other versions of the model may not make the original mistake, so the specific -example might not be apt for future versions. - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -import plac -import spacy - - -def prevent_sentence_boundaries(doc): - for token in doc: - if not can_be_sentence_start(token): - token.is_sent_start = False - return doc - - -def can_be_sentence_start(token): - if token.i == 0: - return True - # We're not checking for is_title here to ignore arbitrary titlecased - # tokens within sentences - # elif token.is_title: - # return True - elif token.nbor(-1).is_punct: - return True - elif token.nbor(-1).is_space: - return True - else: - return False - - -@plac.annotations( - text=("The raw text to process", "positional", None, str), - spacy_model=("spaCy model to use (with a parser)", "option", "m", str), -) -def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"): - print("Using spaCy model '{}'".format(spacy_model)) - print("Processing text '{}'".format(text)) - nlp = spacy.load(spacy_model) - doc = nlp(text) - sentences = [sent.text.strip() for sent in doc.sents] - print("Before:", sentences) - nlp.add_pipe(prevent_sentence_boundaries, before="parser") - doc = nlp(text) - sentences = [sent.text.strip() for sent in doc.sents] - print("After:", sentences) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/pipeline/fix_space_entities.py b/examples/pipeline/fix_space_entities.py deleted file mode 100644 index 686253eca..000000000 --- a/examples/pipeline/fix_space_entities.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Demonstrate adding a rule-based component that forces some tokens to not -be entities, before the NER tagger is applied. This is used to hotfix the issue -in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16. 
- -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals - -import spacy -from spacy.attrs import ENT_IOB - - -def fix_space_tags(doc): - ent_iobs = doc.to_array([ENT_IOB]) - for i, token in enumerate(doc): - if token.is_space: - # Sets 'O' tag (0 is None, so I is 1, O is 2) - ent_iobs[i] = 2 - doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1))) - return doc - - -def main(): - nlp = spacy.load("en_core_web_sm") - text = "This is some crazy test where I dont need an Apple Watch to make things bug" - doc = nlp(text) - print("Before", doc.ents) - nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner") - doc = nlp(text) - print("After", doc.ents) - - -if __name__ == "__main__": - main() diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py deleted file mode 100644 index e4aca7912..000000000 --- a/examples/pipeline/multi_processing.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of multi-processing with Joblib. Here, we're exporting -part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with -each "sentence" on a newline, and spaces between tokens. Data is loaded from -the IMDB movie reviews dataset and will be loaded automatically via Thinc's -built-in dataset loader. - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -Prerequisites: pip install joblib -""" -from __future__ import print_function, unicode_literals - -from pathlib import Path - -import ml_datasets -from joblib import Parallel, delayed -from functools import partial -import plac -import spacy -from spacy.util import minibatch - - -@plac.annotations( - output_dir=("Output directory", "positional", None, Path), - model=("Model name (needs tagger)", "positional", None, str), - n_jobs=("Number of workers", "option", "n", int), - batch_size=("Batch-size for each process", "option", "b", int), - limit=("Limit of entries from the dataset", "option", "l", int), -) -def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000): - nlp = spacy.load(model) # load spaCy model - print("Loaded model '%s'" % model) - if not output_dir.exists(): - output_dir.mkdir() - # load and pre-process the IMBD dataset - print("Loading IMDB data...") - data, _ = ml_datasets.imdb() - texts, _ = zip(*data[-limit:]) - print("Processing texts...") - partitions = minibatch(texts, size=batch_size) - executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes") - do = delayed(partial(transform_texts, nlp)) - tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions)) - executor(tasks) - - -def transform_texts(nlp, batch_id, texts, output_dir): - print(nlp.pipe_names) - out_path = Path(output_dir) / ("%d.txt" % batch_id) - if out_path.exists(): # return None in case same batch is called again - return None - print("Processing batch", batch_id) - with out_path.open("w", encoding="utf8") as f: - for doc in nlp.pipe(texts): - f.write(" ".join(represent_word(w) for w in doc if not w.is_space)) - f.write("\n") - print("Saved {} texts to {}.txt".format(len(texts), batch_id)) - - -def represent_word(word): - text = word.text - # True-case, i.e. try to normalize sentence-initial capitals. - # Only do this if the lower-cased form is more probable. 
- if ( - text.istitle() - and is_sent_begin(word) - and word.prob < word.doc.vocab[text.lower()].prob - ): - text = text.lower() - return text + "|" + word.tag_ - - -def is_sent_begin(word): - if word.i == 0: - return True - elif word.i >= 2 and word.nbor(-1).text in (".", "!", "?", "..."): - return True - else: - return False - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py deleted file mode 100644 index 2b527b3df..000000000 --- a/examples/streamlit_spacy.py +++ /dev/null @@ -1,165 +0,0 @@ -# coding: utf-8 -""" -Example of a Streamlit app for an interactive spaCy model visualizer. You can -either download the script, or point `streamlit run` to the raw URL of this -file. For more details, see https://streamlit.io. - -Installation: -pip install streamlit -python -m spacy download en_core_web_sm -python -m spacy download en_core_web_md -python -m spacy download de_core_news_sm - -Usage: -streamlit run streamlit_spacy.py -""" -from __future__ import unicode_literals - -import base64 - -import streamlit as st -import spacy -from spacy import displacy -import pandas as pd - - -SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"] -DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook." -HTML_WRAPPER = """
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>
""" - - -@st.cache(allow_output_mutation=True) -def load_model(name): - return spacy.load(name) - - -@st.cache(allow_output_mutation=True) -def process_text(model_name, text): - nlp = load_model(model_name) - return nlp(text) - - -st.sidebar.title("Interactive spaCy visualizer") -st.sidebar.markdown( - """ -Process text with [spaCy](https://spacy.io) models and visualize named entities, -dependencies and more. Uses spaCy's built-in -[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood. -""" -) - -spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES) -model_load_state = st.info(f"Loading model '{spacy_model}'...") -nlp = load_model(spacy_model) -model_load_state.empty() - -text = st.text_area("Text to analyze", DEFAULT_TEXT) -doc = process_text(spacy_model, text) - - -def render_svg(svg): - """Renders the given svg string.""" - b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8") - html = r'' % b64 - st.write(html, unsafe_allow_html=True) - - -if "parser" in nlp.pipe_names: - st.header("Dependency Parse & Part-of-speech tags") - st.sidebar.header("Dependency Parse") - split_sents = st.sidebar.checkbox("Split sentences", value=True) - collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True) - collapse_phrases = st.sidebar.checkbox("Collapse phrases") - compact = st.sidebar.checkbox("Compact mode") - options = { - "collapse_punct": collapse_punct, - "collapse_phrases": collapse_phrases, - "compact": compact, - } - docs = [span.as_doc() for span in doc.sents] if split_sents else [doc] - for sent in docs: - html = displacy.render(sent, options=options, style="dep") - # Double newlines seem to mess with the rendering - html = html.replace("\n\n", "\n") - if split_sents and len(docs) > 1: - st.markdown(f"> {sent.text}") - render_svg(html) - # this didn't show the dep arc labels properly, cf #5089 - # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) - -if "ner" in nlp.pipe_names: - st.header("Named Entities") - st.sidebar.header("Named Entities") - label_set = nlp.get_pipe("ner").labels - labels = st.sidebar.multiselect( - "Entity labels", options=label_set, default=list(label_set) - ) - html = displacy.render(doc, style="ent", options={"ents": labels}) - # Newlines seem to mess with the rendering - html = html.replace("\n", " ") - st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) - attrs = ["text", "label_", "start", "end", "start_char", "end_char"] - if "entity_linker" in nlp.pipe_names: - attrs.append("kb_id_") - data = [ - [str(getattr(ent, attr)) for attr in attrs] - for ent in doc.ents - if ent.label_ in labels - ] - df = pd.DataFrame(data, columns=attrs) - st.dataframe(df) - - -if "textcat" in nlp.pipe_names: - st.header("Text Classification") - st.markdown(f"> {text}") - df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score")) - st.dataframe(df) - - -vector_size = nlp.meta.get("vectors", {}).get("width", 0) -if vector_size: - st.header("Vectors & Similarity") - st.code(nlp.meta["vectors"]) - text1 = st.text_input("Text or word 1", "apple") - text2 = st.text_input("Text or word 2", "orange") - doc1 = process_text(spacy_model, text1) - doc2 = process_text(spacy_model, text2) - similarity = doc1.similarity(doc2) - if similarity > 0.5: - st.success(similarity) - else: - st.error(similarity) - -st.header("Token attributes") - -if st.button("Show token attributes"): - attrs = [ - "idx", - "text", - "lemma_", - "pos_", - "tag_", - "dep_", - "head", - "ent_type_", - "ent_iob_", - "shape_", - "is_alpha", - 
"is_ascii", - "is_digit", - "is_punct", - "like_num", - ] - data = [[str(getattr(token, attr)) for attr in attrs] for token in doc] - df = pd.DataFrame(data, columns=attrs) - st.dataframe(df) - - -st.header("JSON Doc") -if st.button("Show JSON Doc"): - st.json(doc.to_json()) - -st.header("JSON model meta") -if st.button("Show JSON model meta"): - st.json(nlp.meta) diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json deleted file mode 100644 index 9a11dd96b..000000000 --- a/examples/training/conllu-config.json +++ /dev/null @@ -1 +0,0 @@ -{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0} diff --git a/examples/training/conllu.py b/examples/training/conllu.py deleted file mode 100644 index a398b0ae0..000000000 --- a/examples/training/conllu.py +++ /dev/null @@ -1,404 +0,0 @@ -"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes -.conllu format for development data, allowing the official scorer to be used. -""" -from __future__ import unicode_literals -import plac -import attr -from pathlib import Path -import re -import json -import tqdm - -import spacy -import spacy.util -from spacy.tokens import Token, Doc -from spacy.gold import Example -from spacy.pipeline._parser_internals.nonproj import projectivize -from collections import defaultdict -from spacy.matcher import Matcher - -import itertools -import random -import numpy.random - -from bin.ud import conll17_ud_eval - -import spacy.lang.zh -import spacy.lang.ja - -spacy.lang.zh.Chinese.Defaults.use_jieba = False -spacy.lang.ja.Japanese.Defaults.use_janome = False - -random.seed(0) -numpy.random.seed(0) - - -################ -# Data reading # -################ - -space_re = re.compile("\s+") - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -def read_data( - nlp, - conllu_file, - text_file, - raw_text=True, - oracle_segments=False, - max_doc_length=None, - limit=None, -): - """Read the CONLLU format into Example objects. If raw_text=True, - include Doc objects created using nlp.make_doc and then aligned against - the gold-standard sequences. If oracle_segments=True, include Doc objects - created from the gold-standard segments. At least one must be True.""" - if not raw_text and not oracle_segments: - raise ValueError("At least one of raw_text or oracle_segments must be True") - paragraphs = split_text(text_file.read()) - conllu = read_conllu(conllu_file) - # sd is spacy doc; cd is conllu doc - # cs is conllu sent, ct is conllu token - docs = [] - golds = [] - for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)): - sent_annots = [] - for cs in cd: - sent = defaultdict(list) - for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: - if "." 
in id_: - continue - if "-" in id_: - continue - id_ = int(id_) - 1 - head = int(head) - 1 if head != "0" else id_ - sent["words"].append(word) - sent["tags"].append(tag) - sent["heads"].append(head) - sent["deps"].append("ROOT" if dep == "root" else dep) - sent["spaces"].append(space_after == "_") - sent["entities"] = ["-"] * len(sent["words"]) - sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) - if oracle_segments: - docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) - golds.append(sent) - - sent_annots.append(sent) - if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: - doc, gold = _make_gold(nlp, None, sent_annots) - sent_annots = [] - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - - if raw_text and sent_annots: - doc, gold = _make_gold(nlp, None, sent_annots) - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - return golds_to_gold_data(docs, golds) - - -def read_conllu(file_): - docs = [] - sent = [] - doc = [] - for line in file_: - if line.startswith("# newdoc"): - if doc: - docs.append(doc) - doc = [] - elif line.startswith("#"): - continue - elif not line.strip(): - if sent: - doc.append(sent) - sent = [] - else: - sent.append(list(line.strip().split("\t"))) - if len(sent[-1]) != 10: - print(repr(line)) - raise ValueError - if sent: - doc.append(sent) - if doc: - docs.append(doc) - return docs - - -def _make_gold(nlp, text, sent_annots): - # Flatten the conll annotations, and adjust the head indices - gold = defaultdict(list) - for sent in sent_annots: - gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"]) - for field in ["words", "tags", "deps", "entities", "spaces"]: - gold[field].extend(sent[field]) - # Construct text if necessary - assert len(gold["words"]) == len(gold["spaces"]) - if text is None: - text = "".join( - word + " " * space for word, space in zip(gold["words"], gold["spaces"]) - ) - doc = nlp.make_doc(text) - gold.pop("spaces") - return doc, gold - - -############################# -# Data transforms for spaCy # -############################# - - -def golds_to_gold_data(docs, golds): - """Get out the training data format used by begin_training.""" - data = [] - for doc, gold in zip(docs, golds): - example = Example.from_dict(doc, gold) - data.append(example) - return data - - -############## -# Evaluation # -############## - - -def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - with text_loc.open("r", encoding="utf8") as text_file: - texts = split_text(text_file.read()) - docs = list(nlp.pipe(texts)) - with sys_loc.open("w", encoding="utf8") as out_file: - write_conllu(docs, out_file) - with gold_loc.open("r", encoding="utf8") as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open("r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file) - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) - return scores - - -def write_conllu(docs, file_): - merger = Matcher(docs[0].vocab) - merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) - for i, doc in enumerate(docs): - matches = merger(doc) - spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) - file_.write("# newdoc id = {i}\n".format(i=i)) - for j, sent in enumerate(doc.sents): - 
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) - file_.write("# text = {text}\n".format(text=sent.text)) - for k, token in enumerate(sent): - file_.write(token._.get_conllu_lines(k) + "\n") - file_.write("\n") - - -def print_progress(itn, losses, ud_scores): - fields = { - "dep_loss": losses.get("parser", 0.0), - "tag_loss": losses.get("tagger", 0.0), - "words": ud_scores["Words"].f1 * 100, - "sents": ud_scores["Sentences"].f1 * 100, - "tags": ud_scores["XPOS"].f1 * 100, - "uas": ud_scores["UAS"].f1 * 100, - "las": ud_scores["LAS"].f1 * 100, - } - header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"] - if itn == 0: - print("\t".join(header)) - tpl = "\t".join( - ( - "{:d}", - "{dep_loss:.1f}", - "{las:.1f}", - "{uas:.1f}", - "{tags:.1f}", - "{sents:.1f}", - "{words:.1f}", - ) - ) - print(tpl.format(itn, **fields)) - - -# def get_sent_conllu(sent, sent_id): -# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] - - -def get_token_conllu(token, i): - if token._.begins_fused: - n = 1 - while token.nbor(n)._.inside_fused: - n += 1 - id_ = "%d-%d" % (i, i + n) - lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"] - else: - lines = [] - if token.head.i == token.i: - head = 0 - else: - head = i + (token.head.i - token.i) + 1 - fields = [ - str(i + 1), - token.text, - token.lemma_, - token.pos_, - token.tag_, - "_", - str(head), - token.dep_.lower(), - "_", - "_", - ] - lines.append("\t".join(fields)) - return "\n".join(lines) - - -################## -# Initialization # -################## - - -def load_nlp(corpus, config): - lang = corpus.split("_")[0] - nlp = spacy.blank(lang) - if config.vectors: - nlp.vocab.from_disk(config.vectors / "vocab") - return nlp - - -def initialize_pipeline(nlp, examples, config): - nlp.add_pipe(nlp.create_pipe("parser")) - if config.multitask_tag: - nlp.parser.add_multitask_objective("tag") - if config.multitask_sent: - nlp.parser.add_multitask_objective("sent_start") - nlp.parser.moves.add_action(2, "subtok") - nlp.add_pipe(nlp.create_pipe("tagger")) - for eg in examples: - for tag in eg.get_aligned("TAG", as_string=True): - if tag is not None: - nlp.tagger.add_label(tag) - # Replace labels that didn't make the frequency cutoff - actions = set(nlp.parser.labels) - label_set = set([act.split("-")[1] for act in actions if "-" in act]) - for eg in examples: - gold = eg.gold - for i, label in enumerate(gold.labels): - if label is not None and label not in label_set: - gold.labels[i] = label.split("||")[0] - return nlp.begin_training(lambda: examples) - - -######################## -# Command line helpers # -######################## - - -@attr.s -class Config(object): - vectors = attr.ib(default=None) - max_doc_length = attr.ib(default=10) - multitask_tag = attr.ib(default=True) - multitask_sent = attr.ib(default=True) - nr_epoch = attr.ib(default=30) - batch_size = attr.ib(default=1000) - dropout = attr.ib(default=0.2) - - @classmethod - def load(cls, loc): - with Path(loc).open("r", encoding="utf8") as file_: - cfg = json.load(file_) - return cls(**cfg) - - -class Dataset(object): - def __init__(self, path, section): - self.path = path - self.section = section - self.conllu = None - self.text = None - for file_path in self.path.iterdir(): - name = file_path.parts[-1] - if section in name and name.endswith("conllu"): - self.conllu = file_path - elif section in name and name.endswith("txt"): - self.text = file_path - if self.conllu is None: - msg = "Could not find .txt file in {path} for {section}" - raise 
IOError(msg.format(section=section, path=path)) - if self.text is None: - msg = "Could not find .txt file in {path} for {section}" - self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0] - - -class TreebankPaths(object): - def __init__(self, ud_path, treebank, **cfg): - self.train = Dataset(ud_path / treebank, "train") - self.dev = Dataset(ud_path / treebank, "dev") - self.lang = self.train.lang - - -@plac.annotations( - ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - parses_dir=("Directory to write the development parses", "positional", None, Path), - config=("Path to json formatted config file", "positional", None, Config.load), - corpus=( - "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", - "positional", - None, - str, - ), - limit=("Size limit", "option", "n", int), -) -def main(ud_dir, parses_dir, config, corpus, limit=0): - Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - - Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - - paths = TreebankPaths(ud_dir, corpus) - if not (parses_dir / corpus).exists(): - (parses_dir / corpus).mkdir() - print("Train and evaluate", corpus, "using lang", paths.lang) - nlp = load_nlp(paths.lang, config) - - examples = read_data( - nlp, - paths.train.conllu.open(encoding="utf8"), - paths.train.text.open(encoding="utf8"), - max_doc_length=config.max_doc_length, - limit=limit, - ) - - optimizer = initialize_pipeline(nlp, examples, config) - - for i in range(config.nr_epoch): - batches = spacy.minibatch_by_words(examples, size=config.batch_size) - losses = {} - n_train_words = sum(len(eg.reference.doc) for eg in examples) - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - for batch in batches: - pbar.update(sum(len(eg.reference.doc) for eg in batch)) - nlp.update( - examples=batch, sgd=optimizer, drop=config.dropout, losses=losses, - ) - - out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) - with nlp.use_params(optimizer.averages): - scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path) - print_progress(i, losses, scores) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py deleted file mode 100644 index a455c8d7e..000000000 --- a/examples/training/create_kb.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 - -"""Example of defining a knowledge base in spaCy, -which is needed to implement entity linking functionality. 
- -For more details, see the documentation: -* Knowledge base: https://spacy.io/api/kb -* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking - -Compatible with: spaCy v2.2.4 -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -from pathlib import Path - -from spacy.vocab import Vocab -import spacy -from spacy.kb import KnowledgeBase - - -# Q2146908 (Russ Cochran): American golfer -# Q7381115 (Russ Cochran): publisher -ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} - - -@plac.annotations( - model=("Model name, should have pretrained word embeddings", "positional", None, str), - output_dir=("Optional output directory", "option", "o", Path), -) -def main(model, output_dir=None): - """Load the model and create the KB with pre-defined entity encodings. - If an output_dir is provided, the KB will be stored there in a file 'kb'. - The updated vocab will also be written to a directory in the output_dir.""" - - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - - # check the length of the nlp vectors - if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError( - "The `nlp` object should have access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality. - # For simplicity, we'll just use the original vector dimension here instead. - vectors_dim = nlp.vocab.vectors.shape[1] - kb = KnowledgeBase(nlp.vocab, entity_vector_length=vectors_dim) - - # set up the data - entity_ids = [] - descr_embeddings = [] - freqs = [] - for key, value in ENTITIES.items(): - desc, freq = value - entity_ids.append(key) - descr_embeddings.append(nlp(desc).vector) - freqs.append(freq) - - # set the entities, can also be done by calling `kb.add_entity` for each entity - kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings) - - # adding aliases, the entities need to be defined in the KB beforehand - kb.add_alias( - alias="Russ Cochran", - entities=["Q2146908", "Q7381115"], - probabilities=[0.24, 0.7], # the sum of these probabilities should not exceed 1 - ) - - # test the trained model - print() - _print_kb(kb) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - kb_path = str(output_dir / "kb") - kb.to_disk(kb_path) - print() - print("Saved KB to", kb_path) - - vocab_path = output_dir / "vocab" - kb.vocab.to_disk(vocab_path) - print("Saved vocab to", vocab_path) - - print() - - # test the saved model - # always reload a knowledge base with the same vocab instance! 
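# Why the same vocab matters (an assumption about the v2.x serialization, not
# stated in the script): the KB stores entity IDs and aliases as hashes that
# resolve through the vocab's shared StringStore, so a KB loaded against an
# unrelated vocab may not map those hashes back to the original strings.
# A quick sanity check of the alias priors set above -- a sketch, assuming the
# spaCy v2.x KnowledgeBase.get_candidates API:
#     for cand in kb.get_candidates("Russ Cochran"):
#         print(cand.entity_, cand.prior_prob)  # expected: Q2146908 -> 0.24, Q7381115 -> 0.7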
- print("Loading vocab from", vocab_path) - print("Loading KB from", kb_path) - vocab2 = Vocab().from_disk(vocab_path) - kb2 = KnowledgeBase(vocab2, entity_vector_length=1) - kb2.from_disk(kb_path) - print() - _print_kb(kb2) - - -def _print_kb(kb): - print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings()) - print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings()) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # 2 kb entities: ['Q2146908', 'Q7381115'] - # 1 kb aliases: ['Russ Cochran'] diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py deleted file mode 100644 index baa6d7f06..000000000 --- a/examples/training/ner_multitask_objective.py +++ /dev/null @@ -1,88 +0,0 @@ -"""This example shows how to add a multi-task objective that is trained -alongside the entity recognizer. This is an alternative to adding features -to the model. - -The multi-task idea is to train an auxiliary model to predict some attribute, -with weights shared between the auxiliary model and the main model. In this -example, we're predicting the position of the word in the document. - -The model that predicts the position of the word encourages the convolutional -layers to include the position information in their representation. The -information is then available to the main model, as a feature. - -The overall idea is that we might know something about what sort of features -we'd like the CNN to extract. The multi-task objectives can encourage the -extraction of this type of feature. The multi-task objective is only used -during training. We discard the auxiliary model before run-time. - -The specific example here is not necessarily a good idea --- but it shows -how an arbitrary objective function for some word can be used. - -Developed and tested for spaCy 2.0.6. Updated for v2.2.2 -""" -import random -import plac -import spacy -import os.path - -from spacy.gold.example import Example -from spacy.tokens import Doc -from spacy.gold import read_json_file - -random.seed(0) - -PWD = os.path.dirname(__file__) - -TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json"))) - - -def get_position_label(i, token_annotation): - """Return labels indicating the position of the word in the document. 
- """ - if len(token_annotation.words) < 20: - return "short-doc" - elif i == 0: - return "first-word" - elif i < 10: - return "early-word" - elif i < 20: - return "mid-word" - elif i == len(token_annotation.words) - 1: - return "last-word" - else: - return "late-word" - - -def main(n_iter=10): - nlp = spacy.blank("en") - ner = nlp.create_pipe("ner") - ner.add_multitask_objective(get_position_label) - nlp.add_pipe(ner) - print(nlp.pipeline) - - print("Create data", len(TRAIN_DATA)) - optimizer = nlp.begin_training() - for itn in range(n_iter): - random.shuffle(TRAIN_DATA) - losses = {} - for example_dict in TRAIN_DATA: - doc = Doc(nlp.vocab, words=example_dict["words"]) - example = Example.from_dict(doc, example_dict) - nlp.update( - examples=[example], # 1 example - drop=0.2, # dropout - make it harder to memorise data - sgd=optimizer, # callable to update weights - losses=losses, - ) - print(losses.get("nn_labeller", 0.0), losses["ner"]) - - # test the trained model - for example_dict in TRAIN_DATA: - if "text" in example_dict: - doc = nlp(example_dict["text"]) - print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py deleted file mode 100644 index a7eb120c9..000000000 --- a/examples/training/rehearsal.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Prevent catastrophic forgetting with rehearsal updates.""" -import plac -import random -import warnings -import srsly -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - -# TODO: further fix & test this script for v.3 ? (read_gold_data is never called) - -LABEL = "ANIMAL" -TRAIN_DATA = [ - ( - "Horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, "ANIMAL")]}, - ), - ("Do they bite?", {"entities": []}), - ( - "horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, "ANIMAL")]}, - ), - ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}), - ( - "they pretend to care about your feelings, those horses", - {"entities": [(48, 54, "ANIMAL")]}, - ), - ("horses?", {"entities": [(0, 6, "ANIMAL")]}), -] - - -def read_raw_data(nlp, jsonl_loc): - for json_obj in srsly.read_jsonl(jsonl_loc): - if json_obj["text"].strip(): - doc = nlp.make_doc(json_obj["text"]) - yield Example.from_dict(doc, {}) - - -def read_gold_data(nlp, gold_loc): - examples = [] - for json_obj in srsly.read_jsonl(gold_loc): - doc = nlp.make_doc(json_obj["text"]) - ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]] - example = Example.from_dict(doc, {"entities": ents}) - examples.append(example) - return examples - - -def main(model_name, unlabelled_loc): - n_iter = 10 - dropout = 0.2 - batch_size = 4 - nlp = spacy.load(model_name) - nlp.get_pipe("ner").add_label(LABEL) - raw_examples = list(read_raw_data(nlp, unlabelled_loc)) - optimizer = nlp.resume_training() - # Avoid use of Adam when resuming training. I don't understand this well - # yet, but I'm getting weird results from Adam. Try commenting out the - # nlp.update(), and using Adam -- you'll find the models drift apart. - # I guess Adam is losing precision, introducing gradient noise? 
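# A note on the three settings below (an inference about the intent, not
# documented here): with b1 = b2 = 0 the optimizer stops accumulating Adam's
# momentum and second-moment statistics across updates, so the resumed
# training behaves much more like plain SGD at learn_rate 0.1 -- which is
# what the comment above is aiming for. The exact behaviour depends on
# Thinc's Optimizer implementation.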
- optimizer.learn_rate = 0.1 - optimizer.b1 = 0.0 - optimizer.b2 = 0.0 - sizes = compounding(1.0, 4.0, 1.001) - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - - with nlp.select_pipes(enable="ner") and warnings.catch_warnings(): - # show warnings for misaligned entity spans once - warnings.filterwarnings("once", category=UserWarning, module="spacy") - - for itn in range(n_iter): - random.shuffle(train_examples) - random.shuffle(raw_examples) - losses = {} - r_losses = {} - # batch up the examples using spaCy's minibatch - raw_batches = minibatch(raw_examples, size=4) - for batch in minibatch(train_examples, size=sizes): - nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses) - raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) - print("Losses", losses) - print("R. Losses", r_losses) - print(nlp.get_pipe("ner").model.unseen_classes) - test_text = "Do you like horses?" - doc = nlp(test_text) - print("Entities in '%s'" % test_text) - for ent in doc.ents: - print(ent.label_, ent.text) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py deleted file mode 100644 index d2bd61e5b..000000000 --- a/examples/training/train_entity_linker.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 - -"""Example of training spaCy's entity linker, starting off with a predefined -knowledge base and corresponding vocab, and a blank English model. - -For more details, see the documentation: -* Training: https://spacy.io/usage/training -* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking - -Compatible with: spaCy v2.2.4 -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy - -from spacy.gold import Example -from spacy.pipeline import EntityRuler -from spacy.util import minibatch, compounding - - -def sample_train_data(): - train_data = [] - - # Q2146908 (Russ Cochran): American golfer - # Q7381115 (Russ Cochran): publisher - - text_1 = "Russ Cochran his reprints include EC Comics." - dict_1 = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}} - train_data.append((text_1, {"links": dict_1})) - - text_2 = "Russ Cochran has been publishing comic art." - dict_2 = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}} - train_data.append((text_2, {"links": dict_2})) - - text_3 = "Russ Cochran captured his first major title with his son as caddie." - dict_3 = {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}} - train_data.append((text_3, {"links": dict_3})) - - text_4 = "Russ Cochran was a member of University of Kentucky's golf team." - dict_4 = {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}} - train_data.append((text_4, {"links": dict_4})) - - return train_data - - -# training data -TRAIN_DATA = sample_train_data() - - -@plac.annotations( - kb_path=("Path to the knowledge base", "positional", None, Path), - vocab_path=("Path to the vocab for the kb", "positional", None, Path), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(kb_path, vocab_path, output_dir=None, n_iter=50): - """Create a blank model with the specified vocab, set up the pipeline and train the entity linker. 
- The `vocab` should be the one used during creation of the KB.""" - # create blank English model with correct vocab - nlp = spacy.blank("en") - nlp.vocab.from_disk(vocab_path) - nlp.vocab.vectors.name = "spacy_pretrained_vectors" - print("Created blank 'en' model with vocab from '%s'" % vocab_path) - - # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy. - nlp.add_pipe(nlp.create_pipe("sentencizer")) - - # Add a custom component to recognize "Russ Cochran" as an entity for the example training data. - # Note that in a realistic application, an actual NER algorithm should be used instead. - ruler = EntityRuler(nlp) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} - ] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - # Create the Entity Linker component and add it to the pipeline. - if "entity_linker" not in nlp.pipe_names: - print("Loading Knowledge Base from '%s'" % kb_path) - cfg = { - "kb_loader": { - "@assets": "spacy.KBFromFile.v1", - "vocab_path": vocab_path, - "kb_path": kb_path, - }, - # use only the predicted EL score and not the prior probability (for demo purposes) - "incl_prior": False, - } - entity_linker = nlp.create_pipe("entity_linker", cfg) - nlp.add_pipe(entity_linker, last=True) - - # Convert the texts to docs to make sure we have doc.ents set for the training examples. - # Also ensure that the annotated examples correspond to known identifiers in the knowledge base. - kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() - train_examples = [] - for text, annotation in TRAIN_DATA: - with nlp.select_pipes(disable="entity_linker"): - doc = nlp(text) - annotation_clean = annotation - for offset, kb_id_dict in annotation["links"].items(): - new_dict = {} - for kb_id, value in kb_id_dict.items(): - if kb_id in kb_ids: - new_dict[kb_id] = value - else: - print( - "Removed", kb_id, "from training because it is not in the KB." 
- ) - annotation_clean["links"][offset] = new_dict - train_examples.append(Example.from_dict(doc, annotation_clean)) - - with nlp.select_pipes(enable="entity_linker"): # only train entity linker - # reset and initialize the weights randomly - optimizer = nlp.begin_training() - - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update( - batch, - drop=0.2, # dropout - make it harder to memorise data - losses=losses, - sgd=optimizer, - ) - print(itn, "Losses", losses) - - # test the trained model - _apply_model(nlp) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print() - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - _apply_model(nlp2) - - -def _apply_model(nlp): - for text, annotation in TRAIN_DATA: - # apply the entity linker which will now make predictions for the 'Russ Cochran' entities - doc = nlp(text) - print() - print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output (can be shuffled): - - # Entities[('Russ Cochran', 'PERSON', 'Q7381115')] - # Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ("his", '', ''), ('reprints', '', ''), ('include', '', ''), ('The', '', ''), ('Complete', '', ''), ('EC', '', ''), ('Library', '', ''), ('.', '', '')] - - # Entities[('Russ Cochran', 'PERSON', 'Q7381115')] - # Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ('has', '', ''), ('been', '', ''), ('publishing', '', ''), ('comic', '', ''), ('art', '', ''), ('.', '', '')] - - # Entities[('Russ Cochran', 'PERSON', 'Q2146908')] - # Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('captured', '', ''), ('his', '', ''), ('first', '', ''), ('major', '', ''), ('title', '', ''), ('with', '', ''), ('his', '', ''), ('son', '', ''), ('as', '', ''), ('caddie', '', ''), ('.', '', '')] - - # Entities[('Russ Cochran', 'PERSON', 'Q2146908')] - # Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('was', '', ''), ('a', '', ''), ('member', '', ''), ('of', '', ''), ('University', '', ''), ('of', '', ''), ('Kentucky', '', ''), ("'s", '', ''), ('golf', '', ''), ('team', '', ''), ('.', '', '')] diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py deleted file mode 100644 index fffa140f4..000000000 --- a/examples/training/train_intent_parser.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -"""Using the parser to recognise your own semantics - -spaCy's parser component can be trained to predict any type of tree -structure over your input text. You can also predict trees over whole documents -or chat logs, with connections between the sentence-roots used to annotate -discourse structure. In this example, we'll build a message parser for a common -"chat intent": finding local businesses. Our message semantics will have the -following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. 
- -"show me the best hotel in berlin" -('show', 'ROOT', 'show') -('best', 'QUALITY', 'hotel') --> hotel with QUALITY best -('hotel', 'PLACE', 'show') --> show PLACE hotel -('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin - -Compatible with: spaCy v2.0.0+ -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# training data: texts, heads and dependency labels -# for no relation, we simply chose an arbitrary dependency label, e.g. '-' -TRAIN_DATA = [ - ( - "find a cafe with great wifi", - { - "heads": [0, 2, 0, 5, 5, 2], # index of token head - "deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"], - }, - ), - ( - "find a hotel near the beach", - { - "heads": [0, 2, 0, 5, 5, 2], - "deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"], - }, - ), - ( - "find me the closest gym that's open late", - { - "heads": [0, 0, 4, 4, 0, 6, 4, 6, 6], - "deps": [ - "ROOT", - "-", - "-", - "QUALITY", - "PLACE", - "-", - "-", - "ATTRIBUTE", - "TIME", - ], - }, - ), - ( - "show me the cheapest store that sells flowers", - { - "heads": [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! - "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"], - }, - ), - ( - "find a nice restaurant in london", - { - "heads": [0, 3, 3, 0, 3, 3], - "deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"], - }, - ), - ( - "show me the coolest hostel in berlin", - { - "heads": [0, 0, 4, 4, 0, 4, 4], - "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"], - }, - ), - ( - "find a good italian restaurant near work", - { - "heads": [0, 4, 4, 4, 0, 4, 5], - "deps": [ - "ROOT", - "-", - "QUALITY", - "ATTRIBUTE", - "PLACE", - "ATTRIBUTE", - "LOCATION", - ], - }, - ), -] - - -@plac.annotations( - model=("Model name. Defaults to blank 'en' model.", "option", "m", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, output_dir=None, n_iter=15): - """Load the model, set up the pipeline and train the parser.""" - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - - # We'll use the built-in dependency parser class, but we want to create a - # fresh instance – just in case. 
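# Why a fresh instance (an inference, not stated in the script): a parser
# taken from a pretrained model already carries syntactic dependency labels
# and trained weights, so reusing it would mix those with the custom
# ROOT / PLACE / QUALITY / ATTRIBUTE / TIME / LOCATION relations instead of
# learning the new scheme from scratch.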
- if "parser" in nlp.pipe_names: - nlp.remove_pipe("parser") - parser = nlp.create_pipe("parser") - nlp.add_pipe(parser, first=True) - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for dep in annotations.get("deps", []): - parser.add_label(dep) - - with nlp.select_pipes(enable="parser"): # only train parser - optimizer = nlp.begin_training() - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_model(nlp) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - test_model(nlp2) - - -def test_model(nlp): - texts = [ - "find a hotel with good wifi", - "find me the cheapest gym near work", - "show me the best hotel in berlin", - ] - docs = nlp.pipe(texts) - for doc in docs: - print(doc.text) - print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # find a hotel with good wifi - # [ - # ('find', 'ROOT', 'find'), - # ('hotel', 'PLACE', 'find'), - # ('good', 'QUALITY', 'wifi'), - # ('wifi', 'ATTRIBUTE', 'hotel') - # ] - # find me the cheapest gym near work - # [ - # ('find', 'ROOT', 'find'), - # ('cheapest', 'QUALITY', 'gym'), - # ('gym', 'PLACE', 'find'), - # ('near', 'ATTRIBUTE', 'gym'), - # ('work', 'LOCATION', 'near') - # ] - # show me the best hotel in berlin - # [ - # ('show', 'ROOT', 'show'), - # ('best', 'QUALITY', 'hotel'), - # ('hotel', 'PLACE', 'show'), - # ('berlin', 'LOCATION', 'hotel') - # ] diff --git a/examples/training/train_morphologizer.py b/examples/training/train_morphologizer.py deleted file mode 100644 index 8c39a28a6..000000000 --- a/examples/training/train_morphologizer.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -""" -A simple example for training a morphologizer. For more details, see -the documentation: -* Training: https://spacy.io/usage/training - -Compatible with: spaCy v3.0.0+ -Last tested with: v3.0.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding -from spacy.morphology import Morphology - - -# Usually you'll read this in, of course. Data formats vary. Ensure your -# strings are unicode and that the number of tags assigned matches spaCy's -# tokenization. 
If not, you can always add a 'words' key to the annotations -# that specifies the gold-standard tokenization, e.g.: -# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) -TRAIN_DATA = [ - ( - "I like green eggs", - { - "morphs": [ - "PronType=Prs|Person=1", - "VerbForm=Fin", - "Degree=Pos", - "Number=Plur", - ], - "pos": ["PRON", "VERB", "ADJ", "NOUN"], - }, - ), - ( - "Eat blue ham", - { - "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"], - "pos": ["VERB", "ADJ", "NOUN"], - }, - ), - ( - "She was blue", - { - "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"], - "pos": ["PRON", "VERB", "ADJ"], - }, - ), - ( - "He was blue today", - { - "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""], - "pos": ["PRON", "VERB", "ADJ", "ADV"], - }, - ), -] - -# The POS tags are optional, set `with_pos_tags = False` to omit them for -# this example: -with_pos_tags = True - -if not with_pos_tags: - for i in range(len(TRAIN_DATA)): - del TRAIN_DATA[i][1]["pos"] - - -@plac.annotations( - lang=("ISO Code of language to use", "option", "l", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(lang="en", output_dir=None, n_iter=25): - """Create a new model, set up the pipeline and train the tagger. In order to - train the tagger with a custom tag map, we're creating a new Language - instance with a custom vocab. - """ - nlp = spacy.blank(lang) - # add the tagger to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - morphologizer = nlp.create_pipe("morphologizer") - nlp.add_pipe(morphologizer) - - # add labels and create the Example instances - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - morph_labels = annotations.get("morphs") - pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs"))) - assert len(morph_labels) == len(pos_labels) - for morph, pos in zip(morph_labels, pos_labels): - morph_dict = Morphology.feats_to_dict(morph) - if pos: - morph_dict["POS"] = pos - morph = Morphology.dict_to_feats(morph_dict) - morphologizer.add_label(morph) - - optimizer = nlp.begin_training() - for i in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "I like blue eggs" - doc = nlp(test_text) - print("Morphs", [(t.text, t.morph) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the save model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc = nlp2(test_text) - print("Morphs", [(t.text, t.morph) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - -# Expected output: -# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)] diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py deleted file mode 100644 index 26b283777..000000000 --- a/examples/training/train_ner.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# 
coding: utf8 -"""Example of training spaCy's named entity recognizer, starting off with an -existing model or a blank model. - -For more details, see the documentation: -* Training: https://spacy.io/usage/training -* NER: https://spacy.io/usage/linguistic-features#named-entities - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -import warnings -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# training data -TRAIN_DATA = [ - ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), - ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), -] - - -@plac.annotations( - model=("Model name. Defaults to blank 'en' model.", "option", "m", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, output_dir=None, n_iter=100): - """Load the model, set up the pipeline and train the entity recognizer.""" - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - - # create the built-in pipeline components and add them to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - if "simple_ner" not in nlp.pipe_names: - ner = nlp.create_pipe("simple_ner") - nlp.add_pipe(ner, last=True) - # otherwise, get it so we can add labels - else: - ner = nlp.get_pipe("simple_ner") - - # add labels and create Example objects - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for ent in annotations.get("entities"): - print("Add label", ent[2]) - ner.add_label(ent[2]) - - with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings(): - # show warnings for misaligned entity spans once - warnings.filterwarnings("once", category=UserWarning, module="spacy") - - # reset and initialize the weights randomly – but only if we're - # training a new model - if model is None: - nlp.begin_training() - print( - "Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names())) - ) - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update( - batch, - drop=0.0, # dropout - make it harder to memorise data - losses=losses, - ) - print("Losses", losses) - - # test the trained model - for text, _ in TRAIN_DATA: - doc = nlp(text) - print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - for text, _ in TRAIN_DATA: - doc = nlp2(text) - print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Entities [('Shaka Khan', 'PERSON')] - # Tokens [('Who', '', 
2), ('is', '', 2), ('Shaka', 'PERSON', 3), - # ('Khan', 'PERSON', 1), ('?', '', 2)] - # Entities [('London', 'LOC'), ('Berlin', 'LOC')] - # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), - # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)] diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py deleted file mode 100644 index c4edafac4..000000000 --- a/examples/training/train_new_entity_type.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of training an additional entity type - -This script shows how to add a new entity type to an existing pretrained NER -model. To keep the example short and simple, only four sentences are provided -as examples. In practice, you'll need many more — a few hundred would be a -good start. You will also likely need to mix in examples of other entity -types, which might be obtained by running the entity recognizer over unlabelled -sentences, and adding their annotations to the training set. - -The actual training is performed by looping over the examples, and calling -`nlp.entity.update()`. The `update()` method steps through the words of the -input. At each word, it makes a prediction. It then consults the annotations -provided on the GoldParse instance, to see whether it was right. If it was -wrong, it adjusts its weights so that the correct action will score higher -next time. - -After training your model, you can save it to a directory. We recommend -wrapping models as Python packages, for ease of deployment. - -For more details, see the documentation: -* Training: https://spacy.io/usage/training -* NER: https://spacy.io/usage/linguistic-features#named-entities - -Compatible with: spaCy v2.1.0+ -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -import warnings -from pathlib import Path -import spacy -from spacy.util import minibatch, compounding - - -# new entity label -LABEL = "ANIMAL" - -# training data -# Note: If you're using an existing model, make sure to mix in examples of -# other entity types that spaCy correctly recognized before. Otherwise, your -# model might learn the new type, but "forget" what it previously knew. -# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting -TRAIN_DATA = [ - ( - "Horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, LABEL)]}, - ), - ("Do they bite?", {"entities": []}), - ( - "horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, LABEL)]}, - ), - ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}), - ( - "they pretend to care about your feelings, those horses", - {"entities": [(48, 54, LABEL)]}, - ), - ("horses?", {"entities": [(0, 6, LABEL)]}), -] - - -@plac.annotations( - model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), - new_model_name=("New model name for model meta.", "option", "nm", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): - """Set up the pipeline and entity recognizer, and train the new entity.""" - random.seed(0) - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - # Add entity recognizer to model if it's not in the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - train_examples = [] - for text, annotation in TRAIN_DATA: - train_examples.append(TRAIN_DATA.from_dict(nlp(text), annotation)) - - if "ner" not in nlp.pipe_names: - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - # otherwise, get it, so we can add labels to it - else: - ner = nlp.get_pipe("ner") - - ner.add_label(LABEL) # add new entity label to entity recognizer - # Adding extraneous labels shouldn't mess anything up - ner.add_label("VEGETABLE") - if model is None: - optimizer = nlp.begin_training() - else: - optimizer = nlp.resume_training() - move_names = list(ner.move_names) - with nlp.select_pipes(enable="ner") and warnings.catch_warnings(): - # show warnings for misaligned entity spans once - warnings.filterwarnings("once", category=UserWarning, module="spacy") - - sizes = compounding(1.0, 4.0, 1.001) - # batch up the examples using spaCy's minibatch - for itn in range(n_iter): - random.shuffle(train_examples) - batches = minibatch(train_examples, size=sizes) - losses = {} - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "Do you like horses?" - doc = nlp(test_text) - print("Entities in '%s'" % test_text) - for ent in doc.ents: - print(ent.label_, ent.text) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.meta["name"] = new_model_name # rename model - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - # Check the classes have loaded back consistently - assert nlp2.get_pipe("ner").move_names == move_names - doc2 = nlp2(test_text) - for ent in doc2.ents: - print(ent.label_, ent.text) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py deleted file mode 100644 index d46a8f4b9..000000000 --- a/examples/training/train_parser.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of training spaCy dependency parser, starting off with an existing -model or a blank model. 
For more details, see the documentation: -* Training: https://spacy.io/usage/training -* Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# training data -TRAIN_DATA = [ - ( - "They trade mortgage-backed securities.", - { - "heads": [1, 1, 4, 4, 5, 1, 1], - "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], - }, - ), - ( - "I like London and Berlin.", - { - "heads": [1, 1, 1, 2, 2, 1], - "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], - }, - ), -] - - -@plac.annotations( - model=("Model name. Defaults to blank 'en' model.", "option", "m", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, output_dir=None, n_iter=15): - """Load the model, set up the pipeline and train the parser.""" - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - - # add the parser to the pipeline if it doesn't exist - # nlp.create_pipe works for built-ins that are registered with spaCy - if "parser" not in nlp.pipe_names: - parser = nlp.create_pipe("parser") - nlp.add_pipe(parser, first=True) - # otherwise, get it, so we can add labels to it - else: - parser = nlp.get_pipe("parser") - - # add labels to the parser and create the Example objects - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for dep in annotations.get("deps", []): - parser.add_label(dep) - - with nlp.select_pipes(enable="parser"): # only train parser - optimizer = nlp.begin_training() - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "I like securities." - doc = nlp(test_text) - print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc = nlp2(test_text) - print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # expected result: - # [ - # ('I', 'nsubj', 'like'), - # ('like', 'ROOT', 'like'), - # ('securities', 'dobj', 'like'), - # ('.', 'punct', 'like') - # ] diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py deleted file mode 100644 index 4eeb77fb9..000000000 --- a/examples/training/train_tagger.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -""" -A simple example for training a part-of-speech tagger with a custom tag map. 
-To allow us to update the tag map with our custom one, this example starts off -with a blank Language class and modifies its defaults. For more details, see -the documentation: -* Training: https://spacy.io/usage/training -* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# You need to define a mapping from your data's part-of-speech tag names to the -# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. -# See here for the Universal Tag Set: -# http://universaldependencies.github.io/docs/u/pos/index.html -# You may also specify morphological features for your tags, from the universal -# scheme. -TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} - -# Usually you'll read this in, of course. Data formats vary. Ensure your -# strings are unicode and that the number of tags assigned matches spaCy's -# tokenization. If not, you can always add a 'words' key to the annotations -# that specifies the gold-standard tokenization, e.g.: -# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) -TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), -] - - -@plac.annotations( - lang=("ISO Code of language to use", "option", "l", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(lang="en", output_dir=None, n_iter=25): - """Create a new model, set up the pipeline and train the tagger. In order to - train the tagger with a custom tag map, we're creating a new Language - instance with a custom vocab. - """ - nlp = spacy.blank(lang) - # add the tagger to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - tagger = nlp.create_pipe("tagger") - # Add the tags. This needs to be done before you start training. 
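# For example, add_label("N", {"pos": "NOUN"}) registers the custom tag "N"
# and maps it to the universal NOUN part-of-speech, which is why the test
# sentence further down comes out as tuples like ('eggs', 'N', 'NOUN') from
# t.text, t.tag_ and t.pos_ (see the expected output at the end of the script).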
- for tag, values in TAG_MAP.items(): - tagger.add_label(tag, values) - nlp.add_pipe(tagger) - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - - optimizer = nlp.begin_training() - for i in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "I like blue eggs" - doc = nlp(test_text) - print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the save model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc = nlp2(test_text) - print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # [ - # ('I', 'N', 'NOUN'), - # ('like', 'V', 'VERB'), - # ('blue', 'J', 'ADJ'), - # ('eggs', 'N', 'NOUN') - # ] diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py deleted file mode 100644 index 901b382bf..000000000 --- a/examples/training/train_textcat.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Train a convolutional neural network text classifier on the -IMDB dataset, using the TextCategorizer component. The dataset will be loaded -automatically via the package `ml_datasets`. The model is added to -spacy.pipeline, and predictions are available via `doc.cats`. For more details, -see the documentation: -* Training: https://spacy.io/usage/training - -Compatible with: spaCy v3.0.0+ -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -from ml_datasets import loaders - -import spacy -from spacy import util -from spacy.util import minibatch, compounding -from spacy.gold import Example -from thinc.api import Config - - -@plac.annotations( - config_path=("Path to config file", "positional", None, Path), - output_dir=("Optional output directory", "option", "o", Path), - n_texts=("Number of texts to train from", "option", "t", int), - n_iter=("Number of training iterations", "option", "n", int), - init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path), - dataset=("Dataset to train on (default: imdb)", "option", "d", str), - threshold=("Min. 
number of instances for a given label (default 20)", "option", "m", int) -) -def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20): - if not config_path or not config_path.exists(): - raise ValueError(f"Config file not found at {config_path}") - - spacy.util.fix_random_seed() - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - - print(f"Loading nlp model from {config_path}") - nlp_config = Config().from_disk(config_path) - nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True) - - # ensure the nlp object was defined with a textcat component - if "textcat" not in nlp.pipe_names: - raise ValueError(f"The nlp definition in the config does not contain a textcat component") - - textcat = nlp.get_pipe("textcat") - - # load the dataset - print(f"Loading dataset {dataset} ...") - (train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts) - print( - "Using {} examples ({} training, {} evaluation)".format( - n_texts, len(train_texts), len(dev_texts) - ) - ) - train_examples = [] - for text, cats in zip(train_texts, train_cats): - doc = nlp.make_doc(text) - example = Example.from_dict(doc, {"cats": cats}) - for cat in cats: - textcat.add_label(cat) - train_examples.append(example) - - with nlp.select_pipes(enable="textcat"): # only train textcat - optimizer = nlp.begin_training() - if init_tok2vec is not None: - with init_tok2vec.open("rb") as file_: - textcat.model.get_ref("tok2vec").from_bytes(file_.read()) - print("Training the model...") - print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) - batch_sizes = compounding(4.0, 32.0, 1.001) - for i in range(n_iter): - losses = {} - # batch up the examples using spaCy's minibatch - random.shuffle(train_examples) - batches = minibatch(train_examples, size=batch_sizes) - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) - print( - "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table - losses["textcat"], - scores["textcat_p"], - scores["textcat_r"], - scores["textcat_f"], - ) - ) - - # test the trained model (only makes sense for sentiment analysis) - test_text = "This movie sucked" - doc = nlp(test_text) - print(test_text, doc.cats) - - if output_dir is not None: - with nlp.use_params(optimizer.averages): - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc2 = nlp2(test_text) - print(test_text, doc2.cats) - - -def load_data(dataset, threshold, limit=0, split=0.8): - """Load data from the provided dataset.""" - # Partition off part of the train data for evaluation - data_loader = loaders.get(dataset) - train_data, _ = data_loader(limit=int(limit/split)) - random.shuffle(train_data) - texts, labels = zip(*train_data) - - unique_labels = set() - for label_set in labels: - if isinstance(label_set, int) or isinstance(label_set, str): - unique_labels.add(label_set) - elif isinstance(label_set, list) or isinstance(label_set, set): - unique_labels.update(label_set) - unique_labels = sorted(unique_labels) - print(f"# of unique_labels: {len(unique_labels)}") - - count_values_train = dict() - for text, annot_list in train_data: - if 
isinstance(annot_list, int) or isinstance(annot_list, str): - count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1 - else: - for annot in annot_list: - count_values_train[annot] = count_values_train.get(annot, 0) + 1 - for value, count in sorted(count_values_train.items(), key=lambda item: item[1]): - if count < threshold: - unique_labels.remove(value) - - print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}") - - if unique_labels == {0, 1}: - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] - else: - cats = [] - for y in labels: - if isinstance(y, str) or isinstance(y, int): - cats.append({str(label): (label == y) for label in unique_labels}) - elif isinstance(y, set): - cats.append({str(label): (label in y) for label in unique_labels}) - else: - raise ValueError(f"Unrecognised type of labels: {type(y)}") - - split = int(len(train_data) * split) - return (texts[:split], cats[:split]), (texts[split:], cats[split:]) - - -def evaluate(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 0.0 # True positives - fp = 1e-8 # False positives - fn = 1e-8 # False negatives - tn = 0.0 # True negatives - for i, doc in enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if label == "NEGATIVE": - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1.0 - elif score >= 0.5 and gold[label] < 0.5: - fp += 1.0 - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 and gold[label] >= 0.5: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - if (precision + recall) == 0: - f_score = 0.0 - else: - f_score = 2 * (precision * recall) / (precision + recall) - return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/train_textcat_config.cfg b/examples/training/train_textcat_config.cfg deleted file mode 100644 index a1f4e91ce..000000000 --- a/examples/training/train_textcat_config.cfg +++ /dev/null @@ -1,14 +0,0 @@ -[nlp] -lang = "en" -pipeline = ["textcat"] - -[components] - -[components.textcat] -factory = "textcat" - -[components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" -exclusive_classes = true -ngram_size = 1 -no_output_layer = false diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py deleted file mode 100644 index 9b34811f7..000000000 --- a/examples/vectors_fast_text.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Load vectors for a language trained using fastText -https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md -Compatible with: spaCy v2.0.0+ -""" -from __future__ import unicode_literals -import plac -import numpy - -import spacy -from spacy.language import Language - - -@plac.annotations( - vectors_loc=("Path to .vec file", "positional", None, str), - lang=( - "Optional language ID. If not set, blank Language() will be used.", - "positional", - None, - str, - ), -) -def main(vectors_loc, lang=None): - if lang is None: - nlp = Language() - else: - # create empty language class – this is required if you're planning to - # save the model to disk and load it back later (models always need a - # "lang" setting). Use 'xx' for blank multi-language class. 
- nlp = spacy.blank(lang) - with open(vectors_loc, "rb") as file_: - header = file_.readline() - nr_row, nr_dim = header.split() - nlp.vocab.reset_vectors(width=int(nr_dim)) - for line in file_: - line = line.rstrip().decode("utf8") - pieces = line.rsplit(" ", int(nr_dim)) - word = pieces[0] - vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f") - nlp.vocab.set_vector(word, vector) # add the vectors to the vocab - # test the vectors and similarity - text = "class colspan" - doc = nlp(text) - print(text, doc[0].similarity(doc[1])) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/vectors_tensorboard.py b/examples/vectors_tensorboard.py deleted file mode 100644 index 72eda1edc..000000000 --- a/examples/vectors_tensorboard.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Visualize spaCy word vectors in Tensorboard. - -Adapted from: https://gist.github.com/BrikerMan/7bd4e4bd0a00ac9076986148afc06507 -""" -from __future__ import unicode_literals - -from os import path - -import tqdm -import math -import numpy -import plac -import spacy -import tensorflow as tf -from tensorflow.contrib.tensorboard.plugins.projector import ( - visualize_embeddings, - ProjectorConfig, -) - - -@plac.annotations( - vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str), - out_loc=( - "Path to output folder for tensorboard session data", - "positional", - None, - str, - ), - name=( - "Human readable name for tsv file and vectors tensor", - "positional", - None, - str, - ), -) -def main(vectors_loc, out_loc, name="spaCy_vectors"): - meta_file = "{}.tsv".format(name) - out_meta_file = path.join(out_loc, meta_file) - - print("Loading spaCy vectors model: {}".format(vectors_loc)) - model = spacy.load(vectors_loc) - print("Finding lexemes with vectors attached: {}".format(vectors_loc)) - strings_stream = tqdm.tqdm( - model.vocab.strings, total=len(model.vocab.strings), leave=False - ) - queries = [w for w in strings_stream if model.vocab.has_vector(w)] - vector_count = len(queries) - - print( - "Building Tensorboard Projector metadata for ({}) vectors: {}".format( - vector_count, out_meta_file - ) - ) - - # Store vector data in a tensorflow variable - tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1])) - - # Write a tab-separated file that contains information about the vectors for visualization - # - # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata - with open(out_meta_file, "wb") as file_metadata: - # Define columns in the first row - file_metadata.write("Text\tFrequency\n".encode("utf-8")) - # Write out a row for each vector that we add to the tensorflow variable we created - vec_index = 0 - for text in tqdm.tqdm(queries, total=len(queries), leave=False): - # https://github.com/tensorflow/tensorflow/issues/9094 - text = "" if text.lstrip() == "" else text - lex = model.vocab[text] - - # Store vector data and metadata - tf_vectors_variable[vec_index] = model.vocab.get_vector(text) - file_metadata.write( - "{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode( - "utf-8" - ) - ) - vec_index += 1 - - print("Running Tensorflow Session...") - sess = tf.InteractiveSession() - tf.Variable(tf_vectors_variable, trainable=False, name=name) - tf.global_variables_initializer().run() - saver = tf.train.Saver() - writer = tf.summary.FileWriter(out_loc, sess.graph) - - # Link the embeddings into the config - config = ProjectorConfig() - embed = config.embeddings.add() - 
embed.tensor_name = name - embed.metadata_path = meta_file - - # Tell the projector about the configured embeddings and metadata file - visualize_embeddings(writer, config) - - # Save session and print run command to the output - print("Saving Tensorboard Session...") - saver.save(sess, path.join(out_loc, "{}.ckpt".format(name))) - print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc)) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/ner_example_data/README.md b/extra/example_data/ner_example_data/README.md similarity index 100% rename from examples/training/ner_example_data/README.md rename to extra/example_data/ner_example_data/README.md diff --git a/examples/training/ner_example_data/ner-sent-per-line.iob b/extra/example_data/ner_example_data/ner-sent-per-line.iob similarity index 100% rename from examples/training/ner_example_data/ner-sent-per-line.iob rename to extra/example_data/ner_example_data/ner-sent-per-line.iob diff --git a/examples/training/ner_example_data/ner-sent-per-line.json b/extra/example_data/ner_example_data/ner-sent-per-line.json similarity index 100% rename from examples/training/ner_example_data/ner-sent-per-line.json rename to extra/example_data/ner_example_data/ner-sent-per-line.json diff --git a/examples/training/ner_example_data/ner-token-per-line-conll2003.iob b/extra/example_data/ner_example_data/ner-token-per-line-conll2003.iob similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-conll2003.iob rename to extra/example_data/ner_example_data/ner-token-per-line-conll2003.iob diff --git a/examples/training/ner_example_data/ner-token-per-line-conll2003.json b/extra/example_data/ner_example_data/ner-token-per-line-conll2003.json similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-conll2003.json rename to extra/example_data/ner_example_data/ner-token-per-line-conll2003.json diff --git a/examples/training/ner_example_data/ner-token-per-line-with-pos.iob b/extra/example_data/ner_example_data/ner-token-per-line-with-pos.iob similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-with-pos.iob rename to extra/example_data/ner_example_data/ner-token-per-line-with-pos.iob diff --git a/examples/training/ner_example_data/ner-token-per-line-with-pos.json b/extra/example_data/ner_example_data/ner-token-per-line-with-pos.json similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-with-pos.json rename to extra/example_data/ner_example_data/ner-token-per-line-with-pos.json diff --git a/examples/training/ner_example_data/ner-token-per-line.iob b/extra/example_data/ner_example_data/ner-token-per-line.iob similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line.iob rename to extra/example_data/ner_example_data/ner-token-per-line.iob diff --git a/examples/training/ner_example_data/ner-token-per-line.json b/extra/example_data/ner_example_data/ner-token-per-line.json similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line.json rename to extra/example_data/ner_example_data/ner-token-per-line.json diff --git a/examples/training/textcat_example_data/CC0.txt b/extra/example_data/textcat_example_data/CC0.txt similarity index 100% rename from examples/training/textcat_example_data/CC0.txt rename to extra/example_data/textcat_example_data/CC0.txt diff --git a/examples/training/textcat_example_data/CC_BY-SA-3.0.txt 
b/extra/example_data/textcat_example_data/CC_BY-SA-3.0.txt similarity index 100% rename from examples/training/textcat_example_data/CC_BY-SA-3.0.txt rename to extra/example_data/textcat_example_data/CC_BY-SA-3.0.txt diff --git a/examples/training/textcat_example_data/CC_BY-SA-4.0.txt b/extra/example_data/textcat_example_data/CC_BY-SA-4.0.txt similarity index 100% rename from examples/training/textcat_example_data/CC_BY-SA-4.0.txt rename to extra/example_data/textcat_example_data/CC_BY-SA-4.0.txt diff --git a/examples/training/textcat_example_data/README.md b/extra/example_data/textcat_example_data/README.md similarity index 100% rename from examples/training/textcat_example_data/README.md rename to extra/example_data/textcat_example_data/README.md diff --git a/examples/training/textcat_example_data/cooking.json b/extra/example_data/textcat_example_data/cooking.json similarity index 100% rename from examples/training/textcat_example_data/cooking.json rename to extra/example_data/textcat_example_data/cooking.json diff --git a/examples/training/textcat_example_data/cooking.jsonl b/extra/example_data/textcat_example_data/cooking.jsonl similarity index 100% rename from examples/training/textcat_example_data/cooking.jsonl rename to extra/example_data/textcat_example_data/cooking.jsonl diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.json b/extra/example_data/textcat_example_data/jigsaw-toxic-comment.json similarity index 100% rename from examples/training/textcat_example_data/jigsaw-toxic-comment.json rename to extra/example_data/textcat_example_data/jigsaw-toxic-comment.json diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl b/extra/example_data/textcat_example_data/jigsaw-toxic-comment.jsonl similarity index 100% rename from examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl rename to extra/example_data/textcat_example_data/jigsaw-toxic-comment.jsonl diff --git a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py b/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py similarity index 100% rename from examples/training/textcat_example_data/textcatjsonl_to_trainjson.py rename to extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py diff --git a/examples/training/training-data.json b/extra/example_data/training-data.json similarity index 100% rename from examples/training/training-data.json rename to extra/example_data/training-data.json diff --git a/examples/training/vocab-data.jsonl b/extra/example_data/vocab-data.jsonl similarity index 100% rename from examples/training/vocab-data.jsonl rename to extra/example_data/vocab-data.jsonl diff --git a/examples/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg similarity index 100% rename from examples/experiments/onto-joint/defaults.cfg rename to extra/experiments/onto-joint/defaults.cfg diff --git a/examples/experiments/onto-joint/pretrain.cfg b/extra/experiments/onto-joint/pretrain.cfg similarity index 100% rename from examples/experiments/onto-joint/pretrain.cfg rename to extra/experiments/onto-joint/pretrain.cfg diff --git a/examples/experiments/onto-ner.cfg b/extra/experiments/onto-ner.cfg similarity index 100% rename from examples/experiments/onto-ner.cfg rename to extra/experiments/onto-ner.cfg diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg similarity index 100% rename from 
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg rename to extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg similarity index 100% rename from examples/experiments/ptb-joint-pos-dep/defaults.cfg rename to extra/experiments/ptb-joint-pos-dep/defaults.cfg diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg similarity index 100% rename from examples/experiments/tok2vec-ner/charembed_tok2vec.cfg rename to extra/experiments/tok2vec-ner/charembed_tok2vec.cfg diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg similarity index 100% rename from examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg rename to extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg diff --git a/fabfile.py b/fabfile.py deleted file mode 100644 index 760c2c0e2..000000000 --- a/fabfile.py +++ /dev/null @@ -1,149 +0,0 @@ -import contextlib -from pathlib import Path -from fabric.api import local, lcd -from os import path, environ -import shutil -import sys - - -PWD = path.dirname(__file__) -ENV = environ["VENV_DIR"] if "VENV_DIR" in environ else ".env" -VENV_DIR = Path(PWD) / ENV - - -@contextlib.contextmanager -def virtualenv(name, create=False, python="/usr/bin/python3.6"): - python = Path(python).resolve() - env_path = VENV_DIR - if create: - if env_path.exists(): - shutil.rmtree(str(env_path)) - local("{python} -m venv {env_path}".format(python=python, env_path=VENV_DIR)) - - def wrapped_local(cmd, env_vars=[], capture=False, direct=False): - return local( - "source {}/bin/activate && {}".format(env_path, cmd), - shell="/bin/bash", - capture=False, - ) - - yield wrapped_local - - -def env(lang="python3.6"): - if VENV_DIR.exists(): - local("rm -rf {env}".format(env=VENV_DIR)) - if lang.startswith("python3"): - local("{lang} -m venv {env}".format(lang=lang, env=VENV_DIR)) - else: - local("{lang} -m pip install virtualenv --no-cache-dir".format(lang=lang)) - local( - "{lang} -m virtualenv {env} --no-cache-dir".format(lang=lang, env=VENV_DIR) - ) - with virtualenv(VENV_DIR) as venv_local: - print(venv_local("python --version", capture=True)) - venv_local("pip install --upgrade setuptools --no-cache-dir") - venv_local("pip install pytest --no-cache-dir") - venv_local("pip install wheel --no-cache-dir") - venv_local("pip install -r requirements.txt --no-cache-dir") - venv_local("pip install pex --no-cache-dir") - - -def install(): - with virtualenv(VENV_DIR) as venv_local: - venv_local("pip install dist/*.tar.gz") - - -def make(): - with lcd(path.dirname(__file__)): - local( - "export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace", - shell="/bin/bash", - ) - - -def sdist(): - with virtualenv(VENV_DIR) as venv_local: - with lcd(path.dirname(__file__)): - venv_local("python -m pip install -U setuptools srsly") - venv_local("python setup.py sdist") - - -def wheel(): - with virtualenv(VENV_DIR) as venv_local: - with lcd(path.dirname(__file__)): - venv_local("python setup.py bdist_wheel") - - -def pex(): - with virtualenv(VENV_DIR) as venv_local: - with lcd(path.dirname(__file__)): - sha = local("git rev-parse --short HEAD", capture=True) - venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True) - - -def clean(): - with lcd(path.dirname(__file__)): - local("rm -f dist/*.whl") - local("rm -f 
dist/*.pex") - with virtualenv(VENV_DIR) as venv_local: - venv_local("python setup.py clean --all") - - -def test(): - with virtualenv(VENV_DIR) as venv_local: - with lcd(path.dirname(__file__)): - venv_local("pytest -x spacy/tests") - - -def train(): - args = environ.get("SPACY_TRAIN_ARGS", "") - with virtualenv(VENV_DIR) as venv_local: - venv_local("spacy train {args}".format(args=args)) - - -def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=""): - is_not_clean = local("git status --porcelain", capture=True) - if is_not_clean: - print("Repository is not clean") - print(is_not_clean) - sys.exit(1) - git_sha = local("git rev-parse --short HEAD", capture=True) - config_checksum = local("sha256sum {config}".format(config=config), capture=True) - experiment_dir = Path(experiment_dir) / "{}--{}".format( - config_checksum[:6], git_sha - ) - if not experiment_dir.exists(): - experiment_dir.mkdir() - test_data_dir = Path(treebank_dir) / "ud-test-v2.0-conll2017" - assert test_data_dir.exists() - assert test_data_dir.is_dir() - if corpus: - corpora = [corpus] - else: - corpora = ["UD_English", "UD_Chinese", "UD_Japanese", "UD_Vietnamese"] - - local( - "cp {config} {experiment_dir}/config.json".format( - config=config, experiment_dir=experiment_dir - ) - ) - with virtualenv(VENV_DIR) as venv_local: - for corpus in corpora: - venv_local( - "spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}".format( - treebank_dir=treebank_dir, - experiment_dir=experiment_dir, - config=config, - corpus=corpus, - vectors_dir=vectors_dir, - ) - ) - venv_local( - "spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}".format( - test_data_dir=test_data_dir, - experiment_dir=experiment_dir, - config=config, - corpus=corpus, - ) - ) diff --git a/include/msvc9/stdint.h b/include/msvc9/stdint.h deleted file mode 100644 index 4fe0ef9a9..000000000 --- a/include/msvc9/stdint.h +++ /dev/null @@ -1,259 +0,0 @@ -// ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2013 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the product nor the names of its contributors may -// be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_STDINT_H_ // [ -#define _MSC_STDINT_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#if _MSC_VER >= 1600 // [ -#include -#else // ] _MSC_VER >= 1600 [ - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. -#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. char has the same size as __int8 -// so we give up on __intX for them. -#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX 
-#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX -#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN -# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -// These #ifndef's are needed to prevent collisions with . -// Check out Issue 9 for the details. 
-#ifndef INTMAX_C // [ -# define INTMAX_C INT64_C -#endif // INTMAX_C ] -#ifndef UINTMAX_C // [ -# define UINTMAX_C UINT64_C -#endif // UINTMAX_C ] - -#endif // __STDC_CONSTANT_MACROS ] - -#endif // _MSC_VER >= 1600 ] - -#endif // _MSC_STDINT_H_ ] diff --git a/include/murmurhash/MurmurHash2.h b/include/murmurhash/MurmurHash2.h deleted file mode 100644 index 6d7ccf4b2..000000000 --- a/include/murmurhash/MurmurHash2.h +++ /dev/null @@ -1,22 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash2 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -#ifndef _MURMURHASH2_H_ -#define _MURMURHASH2_H_ - -#include - -//----------------------------------------------------------------------------- - -uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed ); -uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed ); -uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed ); -uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed ); -uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed ); -uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH2_H_ - diff --git a/include/murmurhash/MurmurHash3.h b/include/murmurhash/MurmurHash3.h deleted file mode 100644 index 9b4c3c90b..000000000 --- a/include/murmurhash/MurmurHash3.h +++ /dev/null @@ -1,28 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
- -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -#include - -//----------------------------------------------------------------------------- -#ifdef __cplusplus -extern "C" { -#endif - - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -#ifdef __cplusplus -} -#endif - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ diff --git a/include/numpy/__multiarray_api.h b/include/numpy/__multiarray_api.h deleted file mode 100644 index c949d732f..000000000 --- a/include/numpy/__multiarray_api.h +++ /dev/null @@ -1,1686 +0,0 @@ - -#ifdef _MULTIARRAYMODULE - -typedef struct { - PyObject_HEAD - npy_bool obval; -} PyBoolScalarObject; - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION -extern NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type; -extern NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type; -extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#else -NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type; -NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type; -NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#endif - -NPY_NO_EXPORT unsigned int PyArray_GetNDArrayCVersion \ - (void); -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyBigArray_Type; -#else - NPY_NO_EXPORT PyTypeObject PyBigArray_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArray_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArray_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayDescr_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayDescr_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayFlags_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayFlags_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayIter_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayIter_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayMultiIter_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayMultiIter_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT int NPY_NUMUSERTYPES; -#else - NPY_NO_EXPORT int NPY_NUMUSERTYPES; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyBoolArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyBoolArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION -extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#else -NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyNumberArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyNumberArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyIntegerArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PySignedIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PySignedIntegerArrType_Type; -#endif - -#ifdef 
NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUnsignedIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUnsignedIntegerArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyInexactArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyInexactArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyFloatingArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyFloatingArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyComplexFloatingArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyComplexFloatingArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyFlexibleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyFlexibleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCharacterArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCharacterArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyByteArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyByteArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyShortArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyShortArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyIntArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyIntArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyLongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyLongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyLongLongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyLongLongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUByteArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUByteArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUShortArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUShortArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUIntArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUIntArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyULongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyULongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyULongLongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyULongLongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyFloatArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyFloatArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyDoubleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyLongDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyLongDoubleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCFloatArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCFloatArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCDoubleArrType_Type; -#endif - -#ifdef 
NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCLongDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCLongDoubleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyStringArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyStringArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUnicodeArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUnicodeArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyVoidArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyVoidArrType_Type; -#endif - -NPY_NO_EXPORT int PyArray_SetNumericOps \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_GetNumericOps \ - (void); -NPY_NO_EXPORT int PyArray_INCREF \ - (PyArrayObject *); -NPY_NO_EXPORT int PyArray_XDECREF \ - (PyArrayObject *); -NPY_NO_EXPORT void PyArray_SetStringFunction \ - (PyObject *, int); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromType \ - (int); -NPY_NO_EXPORT PyObject * PyArray_TypeObjectFromType \ - (int); -NPY_NO_EXPORT char * PyArray_Zero \ - (PyArrayObject *); -NPY_NO_EXPORT char * PyArray_One \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_CastToType \ - (PyArrayObject *, PyArray_Descr *, int); -NPY_NO_EXPORT int PyArray_CastTo \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CastAnyTo \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CanCastSafely \ - (int, int); -NPY_NO_EXPORT npy_bool PyArray_CanCastTo \ - (PyArray_Descr *, PyArray_Descr *); -NPY_NO_EXPORT int PyArray_ObjectType \ - (PyObject *, int); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromObject \ - (PyObject *, PyArray_Descr *); -NPY_NO_EXPORT PyArrayObject ** PyArray_ConvertToCommonType \ - (PyObject *, int *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromScalar \ - (PyObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromTypeObject \ - (PyObject *); -NPY_NO_EXPORT npy_intp PyArray_Size \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Scalar \ - (void *, PyArray_Descr *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromScalar \ - (PyObject *, PyArray_Descr *); -NPY_NO_EXPORT void PyArray_ScalarAsCtype \ - (PyObject *, void *); -NPY_NO_EXPORT int PyArray_CastScalarToCtype \ - (PyObject *, void *, PyArray_Descr *); -NPY_NO_EXPORT int PyArray_CastScalarDirect \ - (PyObject *, PyArray_Descr *, void *, int); -NPY_NO_EXPORT PyObject * PyArray_ScalarFromObject \ - (PyObject *); -NPY_NO_EXPORT PyArray_VectorUnaryFunc * PyArray_GetCastFunc \ - (PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_FromDims \ - (int, int *, int); -NPY_NO_EXPORT PyObject * PyArray_FromDimsAndDataAndDescr \ - (int, int *, PyArray_Descr *, char *); -NPY_NO_EXPORT PyObject * PyArray_FromAny \ - (PyObject *, PyArray_Descr *, int, int, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_EnsureArray \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_EnsureAnyArray \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromFile \ - (FILE *, PyArray_Descr *, npy_intp, char *); -NPY_NO_EXPORT PyObject * PyArray_FromString \ - (char *, npy_intp, PyArray_Descr *, npy_intp, char *); -NPY_NO_EXPORT PyObject * PyArray_FromBuffer \ - (PyObject *, PyArray_Descr *, npy_intp, npy_intp); -NPY_NO_EXPORT PyObject * PyArray_FromIter \ - (PyObject *, PyArray_Descr *, npy_intp); 
-NPY_NO_EXPORT PyObject * PyArray_Return \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_GetField \ - (PyArrayObject *, PyArray_Descr *, int); -NPY_NO_EXPORT int PyArray_SetField \ - (PyArrayObject *, PyArray_Descr *, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Byteswap \ - (PyArrayObject *, npy_bool); -NPY_NO_EXPORT PyObject * PyArray_Resize \ - (PyArrayObject *, PyArray_Dims *, int, NPY_ORDER); -NPY_NO_EXPORT int PyArray_MoveInto \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CopyInto \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CopyAnyInto \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CopyObject \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_NewCopy \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT PyObject * PyArray_ToList \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_ToString \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT int PyArray_ToFile \ - (PyArrayObject *, FILE *, char *, char *); -NPY_NO_EXPORT int PyArray_Dump \ - (PyObject *, PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Dumps \ - (PyObject *, int); -NPY_NO_EXPORT int PyArray_ValidType \ - (int); -NPY_NO_EXPORT void PyArray_UpdateFlags \ - (PyArrayObject *, int); -NPY_NO_EXPORT PyObject * PyArray_New \ - (PyTypeObject *, int, npy_intp *, int, npy_intp *, void *, int, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_NewFromDescr \ - (PyTypeObject *, PyArray_Descr *, int, npy_intp *, npy_intp *, void *, int, PyObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrNew \ - (PyArray_Descr *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrNewFromType \ - (int); -NPY_NO_EXPORT double PyArray_GetPriority \ - (PyObject *, double); -NPY_NO_EXPORT PyObject * PyArray_IterNew \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_MultiIterNew \ - (int, ...); -NPY_NO_EXPORT int PyArray_PyIntAsInt \ - (PyObject *); -NPY_NO_EXPORT npy_intp PyArray_PyIntAsIntp \ - (PyObject *); -NPY_NO_EXPORT int PyArray_Broadcast \ - (PyArrayMultiIterObject *); -NPY_NO_EXPORT void PyArray_FillObjectArray \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT int PyArray_FillWithScalar \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT npy_bool PyArray_CheckStrides \ - (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrNewByteorder \ - (PyArray_Descr *, char); -NPY_NO_EXPORT PyObject * PyArray_IterAllButAxis \ - (PyObject *, int *); -NPY_NO_EXPORT PyObject * PyArray_CheckFromAny \ - (PyObject *, PyArray_Descr *, int, int, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromArray \ - (PyArrayObject *, PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_FromInterface \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromStructInterface \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromArrayAttr \ - (PyObject *, PyArray_Descr *, PyObject *); -NPY_NO_EXPORT NPY_SCALARKIND PyArray_ScalarKind \ - (int, PyArrayObject **); -NPY_NO_EXPORT int PyArray_CanCoerceScalar \ - (int, int, NPY_SCALARKIND); -NPY_NO_EXPORT PyObject * PyArray_NewFlagsObject \ - (PyObject *); -NPY_NO_EXPORT npy_bool PyArray_CanCastScalar \ - (PyTypeObject *, PyTypeObject *); -NPY_NO_EXPORT int PyArray_CompareUCS4 \ - (npy_ucs4 *, npy_ucs4 *, size_t); -NPY_NO_EXPORT int PyArray_RemoveSmallest \ - (PyArrayMultiIterObject *); -NPY_NO_EXPORT int PyArray_ElementStrides \ - (PyObject *); -NPY_NO_EXPORT void PyArray_Item_INCREF \ - (char *, PyArray_Descr *); -NPY_NO_EXPORT void PyArray_Item_XDECREF \ - (char *, PyArray_Descr *); 
-NPY_NO_EXPORT PyObject * PyArray_FieldNames \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Transpose \ - (PyArrayObject *, PyArray_Dims *); -NPY_NO_EXPORT PyObject * PyArray_TakeFrom \ - (PyArrayObject *, PyObject *, int, PyArrayObject *, NPY_CLIPMODE); -NPY_NO_EXPORT PyObject * PyArray_PutTo \ - (PyArrayObject *, PyObject*, PyObject *, NPY_CLIPMODE); -NPY_NO_EXPORT PyObject * PyArray_PutMask \ - (PyArrayObject *, PyObject*, PyObject*); -NPY_NO_EXPORT PyObject * PyArray_Repeat \ - (PyArrayObject *, PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Choose \ - (PyArrayObject *, PyObject *, PyArrayObject *, NPY_CLIPMODE); -NPY_NO_EXPORT int PyArray_Sort \ - (PyArrayObject *, int, NPY_SORTKIND); -NPY_NO_EXPORT PyObject * PyArray_ArgSort \ - (PyArrayObject *, int, NPY_SORTKIND); -NPY_NO_EXPORT PyObject * PyArray_SearchSorted \ - (PyArrayObject *, PyObject *, NPY_SEARCHSIDE, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_ArgMax \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_ArgMin \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Reshape \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Newshape \ - (PyArrayObject *, PyArray_Dims *, NPY_ORDER); -NPY_NO_EXPORT PyObject * PyArray_Squeeze \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_View \ - (PyArrayObject *, PyArray_Descr *, PyTypeObject *); -NPY_NO_EXPORT PyObject * PyArray_SwapAxes \ - (PyArrayObject *, int, int); -NPY_NO_EXPORT PyObject * PyArray_Max \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Min \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Ptp \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Mean \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Trace \ - (PyArrayObject *, int, int, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Diagonal \ - (PyArrayObject *, int, int, int); -NPY_NO_EXPORT PyObject * PyArray_Clip \ - (PyArrayObject *, PyObject *, PyObject *, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Conjugate \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Nonzero \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Std \ - (PyArrayObject *, int, int, PyArrayObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Sum \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_CumSum \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Prod \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_CumProd \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_All \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Any \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Compress \ - (PyArrayObject *, PyObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Flatten \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT PyObject * PyArray_Ravel \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT npy_intp PyArray_MultiplyList \ - (npy_intp *, int); -NPY_NO_EXPORT int PyArray_MultiplyIntList \ - (int *, int); -NPY_NO_EXPORT void * PyArray_GetPtr \ - (PyArrayObject *, npy_intp*); -NPY_NO_EXPORT int PyArray_CompareLists \ - (npy_intp *, npy_intp *, int); -NPY_NO_EXPORT int PyArray_AsCArray \ - (PyObject **, void *, npy_intp *, int, PyArray_Descr*); -NPY_NO_EXPORT int PyArray_As1D \ - (PyObject **, char **, int 
*, int); -NPY_NO_EXPORT int PyArray_As2D \ - (PyObject **, char ***, int *, int *, int); -NPY_NO_EXPORT int PyArray_Free \ - (PyObject *, void *); -NPY_NO_EXPORT int PyArray_Converter \ - (PyObject *, PyObject **); -NPY_NO_EXPORT int PyArray_IntpFromSequence \ - (PyObject *, npy_intp *, int); -NPY_NO_EXPORT PyObject * PyArray_Concatenate \ - (PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_InnerProduct \ - (PyObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_MatrixProduct \ - (PyObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_CopyAndTranspose \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Correlate \ - (PyObject *, PyObject *, int); -NPY_NO_EXPORT int PyArray_TypestrConvert \ - (int, int); -NPY_NO_EXPORT int PyArray_DescrConverter \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_DescrConverter2 \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_IntpConverter \ - (PyObject *, PyArray_Dims *); -NPY_NO_EXPORT int PyArray_BufferConverter \ - (PyObject *, PyArray_Chunk *); -NPY_NO_EXPORT int PyArray_AxisConverter \ - (PyObject *, int *); -NPY_NO_EXPORT int PyArray_BoolConverter \ - (PyObject *, npy_bool *); -NPY_NO_EXPORT int PyArray_ByteorderConverter \ - (PyObject *, char *); -NPY_NO_EXPORT int PyArray_OrderConverter \ - (PyObject *, NPY_ORDER *); -NPY_NO_EXPORT unsigned char PyArray_EquivTypes \ - (PyArray_Descr *, PyArray_Descr *); -NPY_NO_EXPORT PyObject * PyArray_Zeros \ - (int, npy_intp *, PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_Empty \ - (int, npy_intp *, PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_Where \ - (PyObject *, PyObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Arange \ - (double, double, double, int); -NPY_NO_EXPORT PyObject * PyArray_ArangeObj \ - (PyObject *, PyObject *, PyObject *, PyArray_Descr *); -NPY_NO_EXPORT int PyArray_SortkindConverter \ - (PyObject *, NPY_SORTKIND *); -NPY_NO_EXPORT PyObject * PyArray_LexSort \ - (PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Round \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT unsigned char PyArray_EquivTypenums \ - (int, int); -NPY_NO_EXPORT int PyArray_RegisterDataType \ - (PyArray_Descr *); -NPY_NO_EXPORT int PyArray_RegisterCastFunc \ - (PyArray_Descr *, int, PyArray_VectorUnaryFunc *); -NPY_NO_EXPORT int PyArray_RegisterCanCast \ - (PyArray_Descr *, int, NPY_SCALARKIND); -NPY_NO_EXPORT void PyArray_InitArrFuncs \ - (PyArray_ArrFuncs *); -NPY_NO_EXPORT PyObject * PyArray_IntTupleFromIntp \ - (int, npy_intp *); -NPY_NO_EXPORT int PyArray_TypeNumFromName \ - (char *); -NPY_NO_EXPORT int PyArray_ClipmodeConverter \ - (PyObject *, NPY_CLIPMODE *); -NPY_NO_EXPORT int PyArray_OutputConverter \ - (PyObject *, PyArrayObject **); -NPY_NO_EXPORT PyObject * PyArray_BroadcastToShape \ - (PyObject *, npy_intp *, int); -NPY_NO_EXPORT void _PyArray_SigintHandler \ - (int); -NPY_NO_EXPORT void* _PyArray_GetSigintBuf \ - (void); -NPY_NO_EXPORT int PyArray_DescrAlignConverter \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_DescrAlignConverter2 \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_SearchsideConverter \ - (PyObject *, void *); -NPY_NO_EXPORT PyObject * PyArray_CheckAxis \ - (PyArrayObject *, int *, int); -NPY_NO_EXPORT npy_intp PyArray_OverflowMultiplyList \ - (npy_intp *, int); -NPY_NO_EXPORT int PyArray_CompareString \ - (char *, char *, size_t); -NPY_NO_EXPORT PyObject * PyArray_MultiIterFromObjects \ - (PyObject **, int, int, ...); -NPY_NO_EXPORT int PyArray_GetEndianness \ - (void); 
-NPY_NO_EXPORT unsigned int PyArray_GetNDArrayCFeatureVersion \ - (void); -NPY_NO_EXPORT PyObject * PyArray_Correlate2 \ - (PyObject *, PyObject *, int); -NPY_NO_EXPORT PyObject* PyArray_NeighborhoodIterNew \ - (PyArrayIterObject *, npy_intp *, int, PyArrayObject*); -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyDatetimeArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyDatetimeArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyTimedeltaArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyTimedeltaArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyHalfArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyHalfArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject NpyIter_Type; -#else - NPY_NO_EXPORT PyTypeObject NpyIter_Type; -#endif - -NPY_NO_EXPORT void PyArray_SetDatetimeParseFunction \ - (PyObject *); -NPY_NO_EXPORT void PyArray_DatetimeToDatetimeStruct \ - (npy_datetime, NPY_DATETIMEUNIT, npy_datetimestruct *); -NPY_NO_EXPORT void PyArray_TimedeltaToTimedeltaStruct \ - (npy_timedelta, NPY_DATETIMEUNIT, npy_timedeltastruct *); -NPY_NO_EXPORT npy_datetime PyArray_DatetimeStructToDatetime \ - (NPY_DATETIMEUNIT, npy_datetimestruct *); -NPY_NO_EXPORT npy_datetime PyArray_TimedeltaStructToTimedelta \ - (NPY_DATETIMEUNIT, npy_timedeltastruct *); -NPY_NO_EXPORT NpyIter * NpyIter_New \ - (PyArrayObject *, npy_uint32, NPY_ORDER, NPY_CASTING, PyArray_Descr*); -NPY_NO_EXPORT NpyIter * NpyIter_MultiNew \ - (int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **); -NPY_NO_EXPORT NpyIter * NpyIter_AdvancedNew \ - (int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **, int, int **, npy_intp *, npy_intp); -NPY_NO_EXPORT NpyIter * NpyIter_Copy \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_Deallocate \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_HasDelayedBufAlloc \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_HasExternalLoop \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_EnableExternalLoop \ - (NpyIter *); -NPY_NO_EXPORT npy_intp * NpyIter_GetInnerStrideArray \ - (NpyIter *); -NPY_NO_EXPORT npy_intp * NpyIter_GetInnerLoopSizePtr \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_Reset \ - (NpyIter *, char **); -NPY_NO_EXPORT int NpyIter_ResetBasePointers \ - (NpyIter *, char **, char **); -NPY_NO_EXPORT int NpyIter_ResetToIterIndexRange \ - (NpyIter *, npy_intp, npy_intp, char **); -NPY_NO_EXPORT int NpyIter_GetNDim \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GetNOp \ - (NpyIter *); -NPY_NO_EXPORT NpyIter_IterNextFunc * NpyIter_GetIterNext \ - (NpyIter *, char **); -NPY_NO_EXPORT npy_intp NpyIter_GetIterSize \ - (NpyIter *); -NPY_NO_EXPORT void NpyIter_GetIterIndexRange \ - (NpyIter *, npy_intp *, npy_intp *); -NPY_NO_EXPORT npy_intp NpyIter_GetIterIndex \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GotoIterIndex \ - (NpyIter *, npy_intp); -NPY_NO_EXPORT npy_bool NpyIter_HasMultiIndex \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GetShape \ - (NpyIter *, npy_intp *); -NPY_NO_EXPORT NpyIter_GetMultiIndexFunc * NpyIter_GetGetMultiIndex \ - (NpyIter *, char **); -NPY_NO_EXPORT int NpyIter_GotoMultiIndex \ - (NpyIter *, npy_intp *); -NPY_NO_EXPORT int NpyIter_RemoveMultiIndex \ - (NpyIter *); 
-NPY_NO_EXPORT npy_bool NpyIter_HasIndex \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_IsBuffered \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_IsGrowInner \ - (NpyIter *); -NPY_NO_EXPORT npy_intp NpyIter_GetBufferSize \ - (NpyIter *); -NPY_NO_EXPORT npy_intp * NpyIter_GetIndexPtr \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GotoIndex \ - (NpyIter *, npy_intp); -NPY_NO_EXPORT char ** NpyIter_GetDataPtrArray \ - (NpyIter *); -NPY_NO_EXPORT PyArray_Descr ** NpyIter_GetDescrArray \ - (NpyIter *); -NPY_NO_EXPORT PyArrayObject ** NpyIter_GetOperandArray \ - (NpyIter *); -NPY_NO_EXPORT PyArrayObject * NpyIter_GetIterView \ - (NpyIter *, npy_intp); -NPY_NO_EXPORT void NpyIter_GetReadFlags \ - (NpyIter *, char *); -NPY_NO_EXPORT void NpyIter_GetWriteFlags \ - (NpyIter *, char *); -NPY_NO_EXPORT void NpyIter_DebugPrint \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_IterationNeedsAPI \ - (NpyIter *); -NPY_NO_EXPORT void NpyIter_GetInnerFixedStrideArray \ - (NpyIter *, npy_intp *); -NPY_NO_EXPORT int NpyIter_RemoveAxis \ - (NpyIter *, int); -NPY_NO_EXPORT npy_intp * NpyIter_GetAxisStrideArray \ - (NpyIter *, int); -NPY_NO_EXPORT npy_bool NpyIter_RequiresBuffering \ - (NpyIter *); -NPY_NO_EXPORT char ** NpyIter_GetInitialDataPtrArray \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_CreateCompatibleStrides \ - (NpyIter *, npy_intp, npy_intp *); -NPY_NO_EXPORT int PyArray_CastingConverter \ - (PyObject *, NPY_CASTING *); -NPY_NO_EXPORT npy_intp PyArray_CountNonzero \ - (PyArrayObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_PromoteTypes \ - (PyArray_Descr *, PyArray_Descr *); -NPY_NO_EXPORT PyArray_Descr * PyArray_MinScalarType \ - (PyArrayObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_ResultType \ - (npy_intp, PyArrayObject **, npy_intp, PyArray_Descr **); -NPY_NO_EXPORT npy_bool PyArray_CanCastArrayTo \ - (PyArrayObject *, PyArray_Descr *, NPY_CASTING); -NPY_NO_EXPORT npy_bool PyArray_CanCastTypeTo \ - (PyArray_Descr *, PyArray_Descr *, NPY_CASTING); -NPY_NO_EXPORT PyArrayObject * PyArray_EinsteinSum \ - (char *, npy_intp, PyArrayObject **, PyArray_Descr *, NPY_ORDER, NPY_CASTING, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_NewLikeArray \ - (PyArrayObject *, NPY_ORDER, PyArray_Descr *, int); -NPY_NO_EXPORT int PyArray_GetArrayParamsFromObject \ - (PyObject *, PyArray_Descr *, npy_bool, PyArray_Descr **, int *, npy_intp *, PyArrayObject **, PyObject *); -NPY_NO_EXPORT int PyArray_ConvertClipmodeSequence \ - (PyObject *, NPY_CLIPMODE *, int); -NPY_NO_EXPORT PyObject * PyArray_MatrixProduct2 \ - (PyObject *, PyObject *, PyArrayObject*); -NPY_NO_EXPORT npy_bool NpyIter_IsFirstVisit \ - (NpyIter *, int); -NPY_NO_EXPORT int PyArray_SetBaseObject \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT void PyArray_CreateSortedStridePerm \ - (int, npy_intp *, npy_stride_sort_item *); -NPY_NO_EXPORT void PyArray_RemoveAxesInPlace \ - (PyArrayObject *, npy_bool *); -NPY_NO_EXPORT void PyArray_DebugPrint \ - (PyArrayObject *); -NPY_NO_EXPORT int PyArray_FailUnlessWriteable \ - (PyArrayObject *, const char *); -NPY_NO_EXPORT int PyArray_SetUpdateIfCopyBase \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT void * PyDataMem_NEW \ - (size_t); -NPY_NO_EXPORT void PyDataMem_FREE \ - (void *); -NPY_NO_EXPORT void * PyDataMem_RENEW \ - (void *, size_t); -NPY_NO_EXPORT PyDataMem_EventHookFunc * PyDataMem_SetEventHook \ - (PyDataMem_EventHookFunc *, void *, void **); -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT NPY_CASTING NPY_DEFAULT_ASSIGN_CASTING; -#else - NPY_NO_EXPORT NPY_CASTING 
NPY_DEFAULT_ASSIGN_CASTING; -#endif - - -#else - -#if defined(PY_ARRAY_UNIQUE_SYMBOL) -#define PyArray_API PY_ARRAY_UNIQUE_SYMBOL -#endif - -#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY) -extern void **PyArray_API; -#else -#if defined(PY_ARRAY_UNIQUE_SYMBOL) -void **PyArray_API; -#else -static void **PyArray_API=NULL; -#endif -#endif - -#define PyArray_GetNDArrayCVersion \ - (*(unsigned int (*)(void)) \ - PyArray_API[0]) -#define PyBigArray_Type (*(PyTypeObject *)PyArray_API[1]) -#define PyArray_Type (*(PyTypeObject *)PyArray_API[2]) -#define PyArrayDescr_Type (*(PyTypeObject *)PyArray_API[3]) -#define PyArrayFlags_Type (*(PyTypeObject *)PyArray_API[4]) -#define PyArrayIter_Type (*(PyTypeObject *)PyArray_API[5]) -#define PyArrayMultiIter_Type (*(PyTypeObject *)PyArray_API[6]) -#define NPY_NUMUSERTYPES (*(int *)PyArray_API[7]) -#define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[8]) -#define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[9]) -#define PyGenericArrType_Type (*(PyTypeObject *)PyArray_API[10]) -#define PyNumberArrType_Type (*(PyTypeObject *)PyArray_API[11]) -#define PyIntegerArrType_Type (*(PyTypeObject *)PyArray_API[12]) -#define PySignedIntegerArrType_Type (*(PyTypeObject *)PyArray_API[13]) -#define PyUnsignedIntegerArrType_Type (*(PyTypeObject *)PyArray_API[14]) -#define PyInexactArrType_Type (*(PyTypeObject *)PyArray_API[15]) -#define PyFloatingArrType_Type (*(PyTypeObject *)PyArray_API[16]) -#define PyComplexFloatingArrType_Type (*(PyTypeObject *)PyArray_API[17]) -#define PyFlexibleArrType_Type (*(PyTypeObject *)PyArray_API[18]) -#define PyCharacterArrType_Type (*(PyTypeObject *)PyArray_API[19]) -#define PyByteArrType_Type (*(PyTypeObject *)PyArray_API[20]) -#define PyShortArrType_Type (*(PyTypeObject *)PyArray_API[21]) -#define PyIntArrType_Type (*(PyTypeObject *)PyArray_API[22]) -#define PyLongArrType_Type (*(PyTypeObject *)PyArray_API[23]) -#define PyLongLongArrType_Type (*(PyTypeObject *)PyArray_API[24]) -#define PyUByteArrType_Type (*(PyTypeObject *)PyArray_API[25]) -#define PyUShortArrType_Type (*(PyTypeObject *)PyArray_API[26]) -#define PyUIntArrType_Type (*(PyTypeObject *)PyArray_API[27]) -#define PyULongArrType_Type (*(PyTypeObject *)PyArray_API[28]) -#define PyULongLongArrType_Type (*(PyTypeObject *)PyArray_API[29]) -#define PyFloatArrType_Type (*(PyTypeObject *)PyArray_API[30]) -#define PyDoubleArrType_Type (*(PyTypeObject *)PyArray_API[31]) -#define PyLongDoubleArrType_Type (*(PyTypeObject *)PyArray_API[32]) -#define PyCFloatArrType_Type (*(PyTypeObject *)PyArray_API[33]) -#define PyCDoubleArrType_Type (*(PyTypeObject *)PyArray_API[34]) -#define PyCLongDoubleArrType_Type (*(PyTypeObject *)PyArray_API[35]) -#define PyObjectArrType_Type (*(PyTypeObject *)PyArray_API[36]) -#define PyStringArrType_Type (*(PyTypeObject *)PyArray_API[37]) -#define PyUnicodeArrType_Type (*(PyTypeObject *)PyArray_API[38]) -#define PyVoidArrType_Type (*(PyTypeObject *)PyArray_API[39]) -#define PyArray_SetNumericOps \ - (*(int (*)(PyObject *)) \ - PyArray_API[40]) -#define PyArray_GetNumericOps \ - (*(PyObject * (*)(void)) \ - PyArray_API[41]) -#define PyArray_INCREF \ - (*(int (*)(PyArrayObject *)) \ - PyArray_API[42]) -#define PyArray_XDECREF \ - (*(int (*)(PyArrayObject *)) \ - PyArray_API[43]) -#define PyArray_SetStringFunction \ - (*(void (*)(PyObject *, int)) \ - PyArray_API[44]) -#define PyArray_DescrFromType \ - (*(PyArray_Descr * (*)(int)) \ - PyArray_API[45]) -#define PyArray_TypeObjectFromType \ - (*(PyObject * (*)(int)) \ - PyArray_API[46]) -#define 
PyArray_Zero \ - (*(char * (*)(PyArrayObject *)) \ - PyArray_API[47]) -#define PyArray_One \ - (*(char * (*)(PyArrayObject *)) \ - PyArray_API[48]) -#define PyArray_CastToType \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \ - PyArray_API[49]) -#define PyArray_CastTo \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[50]) -#define PyArray_CastAnyTo \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[51]) -#define PyArray_CanCastSafely \ - (*(int (*)(int, int)) \ - PyArray_API[52]) -#define PyArray_CanCastTo \ - (*(npy_bool (*)(PyArray_Descr *, PyArray_Descr *)) \ - PyArray_API[53]) -#define PyArray_ObjectType \ - (*(int (*)(PyObject *, int)) \ - PyArray_API[54]) -#define PyArray_DescrFromObject \ - (*(PyArray_Descr * (*)(PyObject *, PyArray_Descr *)) \ - PyArray_API[55]) -#define PyArray_ConvertToCommonType \ - (*(PyArrayObject ** (*)(PyObject *, int *)) \ - PyArray_API[56]) -#define PyArray_DescrFromScalar \ - (*(PyArray_Descr * (*)(PyObject *)) \ - PyArray_API[57]) -#define PyArray_DescrFromTypeObject \ - (*(PyArray_Descr * (*)(PyObject *)) \ - PyArray_API[58]) -#define PyArray_Size \ - (*(npy_intp (*)(PyObject *)) \ - PyArray_API[59]) -#define PyArray_Scalar \ - (*(PyObject * (*)(void *, PyArray_Descr *, PyObject *)) \ - PyArray_API[60]) -#define PyArray_FromScalar \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *)) \ - PyArray_API[61]) -#define PyArray_ScalarAsCtype \ - (*(void (*)(PyObject *, void *)) \ - PyArray_API[62]) -#define PyArray_CastScalarToCtype \ - (*(int (*)(PyObject *, void *, PyArray_Descr *)) \ - PyArray_API[63]) -#define PyArray_CastScalarDirect \ - (*(int (*)(PyObject *, PyArray_Descr *, void *, int)) \ - PyArray_API[64]) -#define PyArray_ScalarFromObject \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[65]) -#define PyArray_GetCastFunc \ - (*(PyArray_VectorUnaryFunc * (*)(PyArray_Descr *, int)) \ - PyArray_API[66]) -#define PyArray_FromDims \ - (*(PyObject * (*)(int, int *, int)) \ - PyArray_API[67]) -#define PyArray_FromDimsAndDataAndDescr \ - (*(PyObject * (*)(int, int *, PyArray_Descr *, char *)) \ - PyArray_API[68]) -#define PyArray_FromAny \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int, PyObject *)) \ - PyArray_API[69]) -#define PyArray_EnsureArray \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[70]) -#define PyArray_EnsureAnyArray \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[71]) -#define PyArray_FromFile \ - (*(PyObject * (*)(FILE *, PyArray_Descr *, npy_intp, char *)) \ - PyArray_API[72]) -#define PyArray_FromString \ - (*(PyObject * (*)(char *, npy_intp, PyArray_Descr *, npy_intp, char *)) \ - PyArray_API[73]) -#define PyArray_FromBuffer \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, npy_intp, npy_intp)) \ - PyArray_API[74]) -#define PyArray_FromIter \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, npy_intp)) \ - PyArray_API[75]) -#define PyArray_Return \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[76]) -#define PyArray_GetField \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \ - PyArray_API[77]) -#define PyArray_SetField \ - (*(int (*)(PyArrayObject *, PyArray_Descr *, int, PyObject *)) \ - PyArray_API[78]) -#define PyArray_Byteswap \ - (*(PyObject * (*)(PyArrayObject *, npy_bool)) \ - PyArray_API[79]) -#define PyArray_Resize \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *, int, NPY_ORDER)) \ - PyArray_API[80]) -#define PyArray_MoveInto \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[81]) -#define PyArray_CopyInto \ - (*(int 
(*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[82]) -#define PyArray_CopyAnyInto \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[83]) -#define PyArray_CopyObject \ - (*(int (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[84]) -#define PyArray_NewCopy \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[85]) -#define PyArray_ToList \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[86]) -#define PyArray_ToString \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[87]) -#define PyArray_ToFile \ - (*(int (*)(PyArrayObject *, FILE *, char *, char *)) \ - PyArray_API[88]) -#define PyArray_Dump \ - (*(int (*)(PyObject *, PyObject *, int)) \ - PyArray_API[89]) -#define PyArray_Dumps \ - (*(PyObject * (*)(PyObject *, int)) \ - PyArray_API[90]) -#define PyArray_ValidType \ - (*(int (*)(int)) \ - PyArray_API[91]) -#define PyArray_UpdateFlags \ - (*(void (*)(PyArrayObject *, int)) \ - PyArray_API[92]) -#define PyArray_New \ - (*(PyObject * (*)(PyTypeObject *, int, npy_intp *, int, npy_intp *, void *, int, int, PyObject *)) \ - PyArray_API[93]) -#define PyArray_NewFromDescr \ - (*(PyObject * (*)(PyTypeObject *, PyArray_Descr *, int, npy_intp *, npy_intp *, void *, int, PyObject *)) \ - PyArray_API[94]) -#define PyArray_DescrNew \ - (*(PyArray_Descr * (*)(PyArray_Descr *)) \ - PyArray_API[95]) -#define PyArray_DescrNewFromType \ - (*(PyArray_Descr * (*)(int)) \ - PyArray_API[96]) -#define PyArray_GetPriority \ - (*(double (*)(PyObject *, double)) \ - PyArray_API[97]) -#define PyArray_IterNew \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[98]) -#define PyArray_MultiIterNew \ - (*(PyObject * (*)(int, ...)) \ - PyArray_API[99]) -#define PyArray_PyIntAsInt \ - (*(int (*)(PyObject *)) \ - PyArray_API[100]) -#define PyArray_PyIntAsIntp \ - (*(npy_intp (*)(PyObject *)) \ - PyArray_API[101]) -#define PyArray_Broadcast \ - (*(int (*)(PyArrayMultiIterObject *)) \ - PyArray_API[102]) -#define PyArray_FillObjectArray \ - (*(void (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[103]) -#define PyArray_FillWithScalar \ - (*(int (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[104]) -#define PyArray_CheckStrides \ - (*(npy_bool (*)(int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)) \ - PyArray_API[105]) -#define PyArray_DescrNewByteorder \ - (*(PyArray_Descr * (*)(PyArray_Descr *, char)) \ - PyArray_API[106]) -#define PyArray_IterAllButAxis \ - (*(PyObject * (*)(PyObject *, int *)) \ - PyArray_API[107]) -#define PyArray_CheckFromAny \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int, PyObject *)) \ - PyArray_API[108]) -#define PyArray_FromArray \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \ - PyArray_API[109]) -#define PyArray_FromInterface \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[110]) -#define PyArray_FromStructInterface \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[111]) -#define PyArray_FromArrayAttr \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, PyObject *)) \ - PyArray_API[112]) -#define PyArray_ScalarKind \ - (*(NPY_SCALARKIND (*)(int, PyArrayObject **)) \ - PyArray_API[113]) -#define PyArray_CanCoerceScalar \ - (*(int (*)(int, int, NPY_SCALARKIND)) \ - PyArray_API[114]) -#define PyArray_NewFlagsObject \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[115]) -#define PyArray_CanCastScalar \ - (*(npy_bool (*)(PyTypeObject *, PyTypeObject *)) \ - PyArray_API[116]) -#define PyArray_CompareUCS4 \ - (*(int (*)(npy_ucs4 *, npy_ucs4 *, size_t)) \ - PyArray_API[117]) -#define PyArray_RemoveSmallest 
\ - (*(int (*)(PyArrayMultiIterObject *)) \ - PyArray_API[118]) -#define PyArray_ElementStrides \ - (*(int (*)(PyObject *)) \ - PyArray_API[119]) -#define PyArray_Item_INCREF \ - (*(void (*)(char *, PyArray_Descr *)) \ - PyArray_API[120]) -#define PyArray_Item_XDECREF \ - (*(void (*)(char *, PyArray_Descr *)) \ - PyArray_API[121]) -#define PyArray_FieldNames \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[122]) -#define PyArray_Transpose \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *)) \ - PyArray_API[123]) -#define PyArray_TakeFrom \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *, NPY_CLIPMODE)) \ - PyArray_API[124]) -#define PyArray_PutTo \ - (*(PyObject * (*)(PyArrayObject *, PyObject*, PyObject *, NPY_CLIPMODE)) \ - PyArray_API[125]) -#define PyArray_PutMask \ - (*(PyObject * (*)(PyArrayObject *, PyObject*, PyObject*)) \ - PyArray_API[126]) -#define PyArray_Repeat \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, int)) \ - PyArray_API[127]) -#define PyArray_Choose \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, PyArrayObject *, NPY_CLIPMODE)) \ - PyArray_API[128]) -#define PyArray_Sort \ - (*(int (*)(PyArrayObject *, int, NPY_SORTKIND)) \ - PyArray_API[129]) -#define PyArray_ArgSort \ - (*(PyObject * (*)(PyArrayObject *, int, NPY_SORTKIND)) \ - PyArray_API[130]) -#define PyArray_SearchSorted \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, NPY_SEARCHSIDE, PyObject *)) \ - PyArray_API[131]) -#define PyArray_ArgMax \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[132]) -#define PyArray_ArgMin \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[133]) -#define PyArray_Reshape \ - (*(PyObject * (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[134]) -#define PyArray_Newshape \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *, NPY_ORDER)) \ - PyArray_API[135]) -#define PyArray_Squeeze \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[136]) -#define PyArray_View \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, PyTypeObject *)) \ - PyArray_API[137]) -#define PyArray_SwapAxes \ - (*(PyObject * (*)(PyArrayObject *, int, int)) \ - PyArray_API[138]) -#define PyArray_Max \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[139]) -#define PyArray_Min \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[140]) -#define PyArray_Ptp \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[141]) -#define PyArray_Mean \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[142]) -#define PyArray_Trace \ - (*(PyObject * (*)(PyArrayObject *, int, int, int, int, PyArrayObject *)) \ - PyArray_API[143]) -#define PyArray_Diagonal \ - (*(PyObject * (*)(PyArrayObject *, int, int, int)) \ - PyArray_API[144]) -#define PyArray_Clip \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, PyObject *, PyArrayObject *)) \ - PyArray_API[145]) -#define PyArray_Conjugate \ - (*(PyObject * (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[146]) -#define PyArray_Nonzero \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[147]) -#define PyArray_Std \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *, int)) \ - PyArray_API[148]) -#define PyArray_Sum \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[149]) -#define PyArray_CumSum \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[150]) -#define PyArray_Prod \ - (*(PyObject * (*)(PyArrayObject 
*, int, int, PyArrayObject *)) \ - PyArray_API[151]) -#define PyArray_CumProd \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[152]) -#define PyArray_All \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[153]) -#define PyArray_Any \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[154]) -#define PyArray_Compress \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *)) \ - PyArray_API[155]) -#define PyArray_Flatten \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[156]) -#define PyArray_Ravel \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[157]) -#define PyArray_MultiplyList \ - (*(npy_intp (*)(npy_intp *, int)) \ - PyArray_API[158]) -#define PyArray_MultiplyIntList \ - (*(int (*)(int *, int)) \ - PyArray_API[159]) -#define PyArray_GetPtr \ - (*(void * (*)(PyArrayObject *, npy_intp*)) \ - PyArray_API[160]) -#define PyArray_CompareLists \ - (*(int (*)(npy_intp *, npy_intp *, int)) \ - PyArray_API[161]) -#define PyArray_AsCArray \ - (*(int (*)(PyObject **, void *, npy_intp *, int, PyArray_Descr*)) \ - PyArray_API[162]) -#define PyArray_As1D \ - (*(int (*)(PyObject **, char **, int *, int)) \ - PyArray_API[163]) -#define PyArray_As2D \ - (*(int (*)(PyObject **, char ***, int *, int *, int)) \ - PyArray_API[164]) -#define PyArray_Free \ - (*(int (*)(PyObject *, void *)) \ - PyArray_API[165]) -#define PyArray_Converter \ - (*(int (*)(PyObject *, PyObject **)) \ - PyArray_API[166]) -#define PyArray_IntpFromSequence \ - (*(int (*)(PyObject *, npy_intp *, int)) \ - PyArray_API[167]) -#define PyArray_Concatenate \ - (*(PyObject * (*)(PyObject *, int)) \ - PyArray_API[168]) -#define PyArray_InnerProduct \ - (*(PyObject * (*)(PyObject *, PyObject *)) \ - PyArray_API[169]) -#define PyArray_MatrixProduct \ - (*(PyObject * (*)(PyObject *, PyObject *)) \ - PyArray_API[170]) -#define PyArray_CopyAndTranspose \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[171]) -#define PyArray_Correlate \ - (*(PyObject * (*)(PyObject *, PyObject *, int)) \ - PyArray_API[172]) -#define PyArray_TypestrConvert \ - (*(int (*)(int, int)) \ - PyArray_API[173]) -#define PyArray_DescrConverter \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[174]) -#define PyArray_DescrConverter2 \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[175]) -#define PyArray_IntpConverter \ - (*(int (*)(PyObject *, PyArray_Dims *)) \ - PyArray_API[176]) -#define PyArray_BufferConverter \ - (*(int (*)(PyObject *, PyArray_Chunk *)) \ - PyArray_API[177]) -#define PyArray_AxisConverter \ - (*(int (*)(PyObject *, int *)) \ - PyArray_API[178]) -#define PyArray_BoolConverter \ - (*(int (*)(PyObject *, npy_bool *)) \ - PyArray_API[179]) -#define PyArray_ByteorderConverter \ - (*(int (*)(PyObject *, char *)) \ - PyArray_API[180]) -#define PyArray_OrderConverter \ - (*(int (*)(PyObject *, NPY_ORDER *)) \ - PyArray_API[181]) -#define PyArray_EquivTypes \ - (*(unsigned char (*)(PyArray_Descr *, PyArray_Descr *)) \ - PyArray_API[182]) -#define PyArray_Zeros \ - (*(PyObject * (*)(int, npy_intp *, PyArray_Descr *, int)) \ - PyArray_API[183]) -#define PyArray_Empty \ - (*(PyObject * (*)(int, npy_intp *, PyArray_Descr *, int)) \ - PyArray_API[184]) -#define PyArray_Where \ - (*(PyObject * (*)(PyObject *, PyObject *, PyObject *)) \ - PyArray_API[185]) -#define PyArray_Arange \ - (*(PyObject * (*)(double, double, double, int)) \ - PyArray_API[186]) -#define PyArray_ArangeObj \ - (*(PyObject * (*)(PyObject 
*, PyObject *, PyObject *, PyArray_Descr *)) \ - PyArray_API[187]) -#define PyArray_SortkindConverter \ - (*(int (*)(PyObject *, NPY_SORTKIND *)) \ - PyArray_API[188]) -#define PyArray_LexSort \ - (*(PyObject * (*)(PyObject *, int)) \ - PyArray_API[189]) -#define PyArray_Round \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[190]) -#define PyArray_EquivTypenums \ - (*(unsigned char (*)(int, int)) \ - PyArray_API[191]) -#define PyArray_RegisterDataType \ - (*(int (*)(PyArray_Descr *)) \ - PyArray_API[192]) -#define PyArray_RegisterCastFunc \ - (*(int (*)(PyArray_Descr *, int, PyArray_VectorUnaryFunc *)) \ - PyArray_API[193]) -#define PyArray_RegisterCanCast \ - (*(int (*)(PyArray_Descr *, int, NPY_SCALARKIND)) \ - PyArray_API[194]) -#define PyArray_InitArrFuncs \ - (*(void (*)(PyArray_ArrFuncs *)) \ - PyArray_API[195]) -#define PyArray_IntTupleFromIntp \ - (*(PyObject * (*)(int, npy_intp *)) \ - PyArray_API[196]) -#define PyArray_TypeNumFromName \ - (*(int (*)(char *)) \ - PyArray_API[197]) -#define PyArray_ClipmodeConverter \ - (*(int (*)(PyObject *, NPY_CLIPMODE *)) \ - PyArray_API[198]) -#define PyArray_OutputConverter \ - (*(int (*)(PyObject *, PyArrayObject **)) \ - PyArray_API[199]) -#define PyArray_BroadcastToShape \ - (*(PyObject * (*)(PyObject *, npy_intp *, int)) \ - PyArray_API[200]) -#define _PyArray_SigintHandler \ - (*(void (*)(int)) \ - PyArray_API[201]) -#define _PyArray_GetSigintBuf \ - (*(void* (*)(void)) \ - PyArray_API[202]) -#define PyArray_DescrAlignConverter \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[203]) -#define PyArray_DescrAlignConverter2 \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[204]) -#define PyArray_SearchsideConverter \ - (*(int (*)(PyObject *, void *)) \ - PyArray_API[205]) -#define PyArray_CheckAxis \ - (*(PyObject * (*)(PyArrayObject *, int *, int)) \ - PyArray_API[206]) -#define PyArray_OverflowMultiplyList \ - (*(npy_intp (*)(npy_intp *, int)) \ - PyArray_API[207]) -#define PyArray_CompareString \ - (*(int (*)(char *, char *, size_t)) \ - PyArray_API[208]) -#define PyArray_MultiIterFromObjects \ - (*(PyObject * (*)(PyObject **, int, int, ...)) \ - PyArray_API[209]) -#define PyArray_GetEndianness \ - (*(int (*)(void)) \ - PyArray_API[210]) -#define PyArray_GetNDArrayCFeatureVersion \ - (*(unsigned int (*)(void)) \ - PyArray_API[211]) -#define PyArray_Correlate2 \ - (*(PyObject * (*)(PyObject *, PyObject *, int)) \ - PyArray_API[212]) -#define PyArray_NeighborhoodIterNew \ - (*(PyObject* (*)(PyArrayIterObject *, npy_intp *, int, PyArrayObject*)) \ - PyArray_API[213]) -#define PyTimeIntegerArrType_Type (*(PyTypeObject *)PyArray_API[214]) -#define PyDatetimeArrType_Type (*(PyTypeObject *)PyArray_API[215]) -#define PyTimedeltaArrType_Type (*(PyTypeObject *)PyArray_API[216]) -#define PyHalfArrType_Type (*(PyTypeObject *)PyArray_API[217]) -#define NpyIter_Type (*(PyTypeObject *)PyArray_API[218]) -#define PyArray_SetDatetimeParseFunction \ - (*(void (*)(PyObject *)) \ - PyArray_API[219]) -#define PyArray_DatetimeToDatetimeStruct \ - (*(void (*)(npy_datetime, NPY_DATETIMEUNIT, npy_datetimestruct *)) \ - PyArray_API[220]) -#define PyArray_TimedeltaToTimedeltaStruct \ - (*(void (*)(npy_timedelta, NPY_DATETIMEUNIT, npy_timedeltastruct *)) \ - PyArray_API[221]) -#define PyArray_DatetimeStructToDatetime \ - (*(npy_datetime (*)(NPY_DATETIMEUNIT, npy_datetimestruct *)) \ - PyArray_API[222]) -#define PyArray_TimedeltaStructToTimedelta \ - (*(npy_datetime (*)(NPY_DATETIMEUNIT, npy_timedeltastruct *)) \ - 
PyArray_API[223]) -#define NpyIter_New \ - (*(NpyIter * (*)(PyArrayObject *, npy_uint32, NPY_ORDER, NPY_CASTING, PyArray_Descr*)) \ - PyArray_API[224]) -#define NpyIter_MultiNew \ - (*(NpyIter * (*)(int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **)) \ - PyArray_API[225]) -#define NpyIter_AdvancedNew \ - (*(NpyIter * (*)(int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **, int, int **, npy_intp *, npy_intp)) \ - PyArray_API[226]) -#define NpyIter_Copy \ - (*(NpyIter * (*)(NpyIter *)) \ - PyArray_API[227]) -#define NpyIter_Deallocate \ - (*(int (*)(NpyIter *)) \ - PyArray_API[228]) -#define NpyIter_HasDelayedBufAlloc \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[229]) -#define NpyIter_HasExternalLoop \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[230]) -#define NpyIter_EnableExternalLoop \ - (*(int (*)(NpyIter *)) \ - PyArray_API[231]) -#define NpyIter_GetInnerStrideArray \ - (*(npy_intp * (*)(NpyIter *)) \ - PyArray_API[232]) -#define NpyIter_GetInnerLoopSizePtr \ - (*(npy_intp * (*)(NpyIter *)) \ - PyArray_API[233]) -#define NpyIter_Reset \ - (*(int (*)(NpyIter *, char **)) \ - PyArray_API[234]) -#define NpyIter_ResetBasePointers \ - (*(int (*)(NpyIter *, char **, char **)) \ - PyArray_API[235]) -#define NpyIter_ResetToIterIndexRange \ - (*(int (*)(NpyIter *, npy_intp, npy_intp, char **)) \ - PyArray_API[236]) -#define NpyIter_GetNDim \ - (*(int (*)(NpyIter *)) \ - PyArray_API[237]) -#define NpyIter_GetNOp \ - (*(int (*)(NpyIter *)) \ - PyArray_API[238]) -#define NpyIter_GetIterNext \ - (*(NpyIter_IterNextFunc * (*)(NpyIter *, char **)) \ - PyArray_API[239]) -#define NpyIter_GetIterSize \ - (*(npy_intp (*)(NpyIter *)) \ - PyArray_API[240]) -#define NpyIter_GetIterIndexRange \ - (*(void (*)(NpyIter *, npy_intp *, npy_intp *)) \ - PyArray_API[241]) -#define NpyIter_GetIterIndex \ - (*(npy_intp (*)(NpyIter *)) \ - PyArray_API[242]) -#define NpyIter_GotoIterIndex \ - (*(int (*)(NpyIter *, npy_intp)) \ - PyArray_API[243]) -#define NpyIter_HasMultiIndex \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[244]) -#define NpyIter_GetShape \ - (*(int (*)(NpyIter *, npy_intp *)) \ - PyArray_API[245]) -#define NpyIter_GetGetMultiIndex \ - (*(NpyIter_GetMultiIndexFunc * (*)(NpyIter *, char **)) \ - PyArray_API[246]) -#define NpyIter_GotoMultiIndex \ - (*(int (*)(NpyIter *, npy_intp *)) \ - PyArray_API[247]) -#define NpyIter_RemoveMultiIndex \ - (*(int (*)(NpyIter *)) \ - PyArray_API[248]) -#define NpyIter_HasIndex \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[249]) -#define NpyIter_IsBuffered \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[250]) -#define NpyIter_IsGrowInner \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[251]) -#define NpyIter_GetBufferSize \ - (*(npy_intp (*)(NpyIter *)) \ - PyArray_API[252]) -#define NpyIter_GetIndexPtr \ - (*(npy_intp * (*)(NpyIter *)) \ - PyArray_API[253]) -#define NpyIter_GotoIndex \ - (*(int (*)(NpyIter *, npy_intp)) \ - PyArray_API[254]) -#define NpyIter_GetDataPtrArray \ - (*(char ** (*)(NpyIter *)) \ - PyArray_API[255]) -#define NpyIter_GetDescrArray \ - (*(PyArray_Descr ** (*)(NpyIter *)) \ - PyArray_API[256]) -#define NpyIter_GetOperandArray \ - (*(PyArrayObject ** (*)(NpyIter *)) \ - PyArray_API[257]) -#define NpyIter_GetIterView \ - (*(PyArrayObject * (*)(NpyIter *, npy_intp)) \ - PyArray_API[258]) -#define NpyIter_GetReadFlags \ - (*(void (*)(NpyIter *, char *)) \ - PyArray_API[259]) -#define NpyIter_GetWriteFlags \ - (*(void (*)(NpyIter *, char *)) \ - PyArray_API[260]) 
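Every #define in this block resolves a call through one slot of the PyArray_API table that _import_array() (further down in this header) fills in at import time, so an extension split across several .c files has to share a single copy of that table. A common arrangement, with purely illustrative file and symbol names, is:

    /* mymodule.c -- the one file that owns the table; its module init
     * function must call import_array(). */
    #define PY_ARRAY_UNIQUE_SYMBOL mypkg_ARRAY_API
    #include <numpy/arrayobject.h>

    /* helpers.c and every other file -- reuse the same table as extern. */
    #define PY_ARRAY_UNIQUE_SYMBOL mypkg_ARRAY_API
    #define NO_IMPORT_ARRAY
    #include <numpy/arrayobject.h>

The deleted __ufunc_api.h below applies the same pattern to PyUFunc_API via PY_UFUNC_UNIQUE_SYMBOL and NO_IMPORT_UFUNC.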
-#define NpyIter_DebugPrint \ - (*(void (*)(NpyIter *)) \ - PyArray_API[261]) -#define NpyIter_IterationNeedsAPI \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[262]) -#define NpyIter_GetInnerFixedStrideArray \ - (*(void (*)(NpyIter *, npy_intp *)) \ - PyArray_API[263]) -#define NpyIter_RemoveAxis \ - (*(int (*)(NpyIter *, int)) \ - PyArray_API[264]) -#define NpyIter_GetAxisStrideArray \ - (*(npy_intp * (*)(NpyIter *, int)) \ - PyArray_API[265]) -#define NpyIter_RequiresBuffering \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[266]) -#define NpyIter_GetInitialDataPtrArray \ - (*(char ** (*)(NpyIter *)) \ - PyArray_API[267]) -#define NpyIter_CreateCompatibleStrides \ - (*(int (*)(NpyIter *, npy_intp, npy_intp *)) \ - PyArray_API[268]) -#define PyArray_CastingConverter \ - (*(int (*)(PyObject *, NPY_CASTING *)) \ - PyArray_API[269]) -#define PyArray_CountNonzero \ - (*(npy_intp (*)(PyArrayObject *)) \ - PyArray_API[270]) -#define PyArray_PromoteTypes \ - (*(PyArray_Descr * (*)(PyArray_Descr *, PyArray_Descr *)) \ - PyArray_API[271]) -#define PyArray_MinScalarType \ - (*(PyArray_Descr * (*)(PyArrayObject *)) \ - PyArray_API[272]) -#define PyArray_ResultType \ - (*(PyArray_Descr * (*)(npy_intp, PyArrayObject **, npy_intp, PyArray_Descr **)) \ - PyArray_API[273]) -#define PyArray_CanCastArrayTo \ - (*(npy_bool (*)(PyArrayObject *, PyArray_Descr *, NPY_CASTING)) \ - PyArray_API[274]) -#define PyArray_CanCastTypeTo \ - (*(npy_bool (*)(PyArray_Descr *, PyArray_Descr *, NPY_CASTING)) \ - PyArray_API[275]) -#define PyArray_EinsteinSum \ - (*(PyArrayObject * (*)(char *, npy_intp, PyArrayObject **, PyArray_Descr *, NPY_ORDER, NPY_CASTING, PyArrayObject *)) \ - PyArray_API[276]) -#define PyArray_NewLikeArray \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER, PyArray_Descr *, int)) \ - PyArray_API[277]) -#define PyArray_GetArrayParamsFromObject \ - (*(int (*)(PyObject *, PyArray_Descr *, npy_bool, PyArray_Descr **, int *, npy_intp *, PyArrayObject **, PyObject *)) \ - PyArray_API[278]) -#define PyArray_ConvertClipmodeSequence \ - (*(int (*)(PyObject *, NPY_CLIPMODE *, int)) \ - PyArray_API[279]) -#define PyArray_MatrixProduct2 \ - (*(PyObject * (*)(PyObject *, PyObject *, PyArrayObject*)) \ - PyArray_API[280]) -#define NpyIter_IsFirstVisit \ - (*(npy_bool (*)(NpyIter *, int)) \ - PyArray_API[281]) -#define PyArray_SetBaseObject \ - (*(int (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[282]) -#define PyArray_CreateSortedStridePerm \ - (*(void (*)(int, npy_intp *, npy_stride_sort_item *)) \ - PyArray_API[283]) -#define PyArray_RemoveAxesInPlace \ - (*(void (*)(PyArrayObject *, npy_bool *)) \ - PyArray_API[284]) -#define PyArray_DebugPrint \ - (*(void (*)(PyArrayObject *)) \ - PyArray_API[285]) -#define PyArray_FailUnlessWriteable \ - (*(int (*)(PyArrayObject *, const char *)) \ - PyArray_API[286]) -#define PyArray_SetUpdateIfCopyBase \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[287]) -#define PyDataMem_NEW \ - (*(void * (*)(size_t)) \ - PyArray_API[288]) -#define PyDataMem_FREE \ - (*(void (*)(void *)) \ - PyArray_API[289]) -#define PyDataMem_RENEW \ - (*(void * (*)(void *, size_t)) \ - PyArray_API[290]) -#define PyDataMem_SetEventHook \ - (*(PyDataMem_EventHookFunc * (*)(PyDataMem_EventHookFunc *, void *, void **)) \ - PyArray_API[291]) -#define NPY_DEFAULT_ASSIGN_CASTING (*(NPY_CASTING *)PyArray_API[292]) - -#if !defined(NO_IMPORT_ARRAY) && !defined(NO_IMPORT) -static int -_import_array(void) -{ - int st; - PyObject *numpy = PyImport_ImportModule("numpy.core.multiarray"); - 
PyObject *c_api = NULL; - - if (numpy == NULL) { - PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); - return -1; - } - c_api = PyObject_GetAttrString(numpy, "_ARRAY_API"); - Py_DECREF(numpy); - if (c_api == NULL) { - PyErr_SetString(PyExc_AttributeError, "_ARRAY_API not found"); - return -1; - } - -#if PY_VERSION_HEX >= 0x03000000 - if (!PyCapsule_CheckExact(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCapsule object"); - Py_DECREF(c_api); - return -1; - } - PyArray_API = (void **)PyCapsule_GetPointer(c_api, NULL); -#else - if (!PyCObject_Check(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCObject object"); - Py_DECREF(c_api); - return -1; - } - PyArray_API = (void **)PyCObject_AsVoidPtr(c_api); -#endif - Py_DECREF(c_api); - if (PyArray_API == NULL) { - PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is NULL pointer"); - return -1; - } - - /* Perform runtime check of C API version */ - if (NPY_VERSION != PyArray_GetNDArrayCVersion()) { - PyErr_Format(PyExc_RuntimeError, "module compiled against "\ - "ABI version %x but this version of numpy is %x", \ - (int) NPY_VERSION, (int) PyArray_GetNDArrayCVersion()); - return -1; - } - if (NPY_FEATURE_VERSION > PyArray_GetNDArrayCFeatureVersion()) { - PyErr_Format(PyExc_RuntimeError, "module compiled against "\ - "API version %x but this version of numpy is %x", \ - (int) NPY_FEATURE_VERSION, (int) PyArray_GetNDArrayCFeatureVersion()); - return -1; - } - - /* - * Perform runtime check of endianness and check it matches the one set by - * the headers (npy_endian.h) as a safeguard - */ - st = PyArray_GetEndianness(); - if (st == NPY_CPU_UNKNOWN_ENDIAN) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as unknown endian"); - return -1; - } -#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN - if (st != NPY_CPU_BIG) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\ - "big endian, but detected different endianness at runtime"); - return -1; - } -#elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN - if (st != NPY_CPU_LITTLE) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\ - "little endian, but detected different endianness at runtime"); - return -1; - } -#endif - - return 0; -} - -#if PY_VERSION_HEX >= 0x03000000 -#define NUMPY_IMPORT_ARRAY_RETVAL NULL -#else -#define NUMPY_IMPORT_ARRAY_RETVAL -#endif - -#define import_array() {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return NUMPY_IMPORT_ARRAY_RETVAL; } } - -#define import_array1(ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return ret; } } - -#define import_array2(msg, ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, msg); return ret; } } - -#endif - -#endif diff --git a/include/numpy/__ufunc_api.h b/include/numpy/__ufunc_api.h deleted file mode 100644 index fd81d07b5..000000000 --- a/include/numpy/__ufunc_api.h +++ /dev/null @@ -1,323 +0,0 @@ - -#ifdef _UMATHMODULE - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION -extern NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#else -NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#endif - -NPY_NO_EXPORT PyObject * PyUFunc_FromFuncAndData \ - (PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int); -NPY_NO_EXPORT int 
PyUFunc_RegisterLoopForType \ - (PyUFuncObject *, int, PyUFuncGenericFunction, int *, void *); -NPY_NO_EXPORT int PyUFunc_GenericFunction \ - (PyUFuncObject *, PyObject *, PyObject *, PyArrayObject **); -NPY_NO_EXPORT void PyUFunc_f_f_As_d_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_d_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_f_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_g_g \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_F_F_As_D_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_F_F \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_D_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_G_G \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_O_O \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ff_f_As_dd_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ff_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_dd_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_gg_g \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_FF_F_As_DD_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_DD_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_FF_F \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_GG_G \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_OO_O \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_O_O_method \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_OO_O_method \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_On_Om \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT int PyUFunc_GetPyValues \ - (char *, int *, int *, PyObject **); -NPY_NO_EXPORT int PyUFunc_checkfperr \ - (int, PyObject *, int *); -NPY_NO_EXPORT void PyUFunc_clearfperr \ - (void); -NPY_NO_EXPORT int PyUFunc_getfperr \ - (void); -NPY_NO_EXPORT int PyUFunc_handlefperr \ - (int, PyObject *, int, int *); -NPY_NO_EXPORT int PyUFunc_ReplaceLoopBySignature \ - (PyUFuncObject *, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *); -NPY_NO_EXPORT PyObject * PyUFunc_FromFuncAndDataAndSignature \ - (PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int, const char *); -NPY_NO_EXPORT int PyUFunc_SetUsesArraysAsData \ - (void **, size_t); -NPY_NO_EXPORT void PyUFunc_e_e \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_e_e_As_f_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_e_e_As_d_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ee_e \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ee_e_As_ff_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ee_e_As_dd_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT int PyUFunc_DefaultTypeResolver \ - (PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyUFunc_ValidateCasting \ - (PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyArray_Descr **); - -#else - -#if defined(PY_UFUNC_UNIQUE_SYMBOL) -#define PyUFunc_API PY_UFUNC_UNIQUE_SYMBOL -#endif - -#if defined(NO_IMPORT) || defined(NO_IMPORT_UFUNC) -extern void **PyUFunc_API; -#else -#if 
defined(PY_UFUNC_UNIQUE_SYMBOL) -void **PyUFunc_API; -#else -static void **PyUFunc_API=NULL; -#endif -#endif - -#define PyUFunc_Type (*(PyTypeObject *)PyUFunc_API[0]) -#define PyUFunc_FromFuncAndData \ - (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int)) \ - PyUFunc_API[1]) -#define PyUFunc_RegisterLoopForType \ - (*(int (*)(PyUFuncObject *, int, PyUFuncGenericFunction, int *, void *)) \ - PyUFunc_API[2]) -#define PyUFunc_GenericFunction \ - (*(int (*)(PyUFuncObject *, PyObject *, PyObject *, PyArrayObject **)) \ - PyUFunc_API[3]) -#define PyUFunc_f_f_As_d_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[4]) -#define PyUFunc_d_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[5]) -#define PyUFunc_f_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[6]) -#define PyUFunc_g_g \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[7]) -#define PyUFunc_F_F_As_D_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[8]) -#define PyUFunc_F_F \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[9]) -#define PyUFunc_D_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[10]) -#define PyUFunc_G_G \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[11]) -#define PyUFunc_O_O \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[12]) -#define PyUFunc_ff_f_As_dd_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[13]) -#define PyUFunc_ff_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[14]) -#define PyUFunc_dd_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[15]) -#define PyUFunc_gg_g \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[16]) -#define PyUFunc_FF_F_As_DD_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[17]) -#define PyUFunc_DD_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[18]) -#define PyUFunc_FF_F \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[19]) -#define PyUFunc_GG_G \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[20]) -#define PyUFunc_OO_O \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[21]) -#define PyUFunc_O_O_method \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[22]) -#define PyUFunc_OO_O_method \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[23]) -#define PyUFunc_On_Om \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[24]) -#define PyUFunc_GetPyValues \ - (*(int (*)(char *, int *, int *, PyObject **)) \ - PyUFunc_API[25]) -#define PyUFunc_checkfperr \ - (*(int (*)(int, PyObject *, int *)) \ - PyUFunc_API[26]) -#define PyUFunc_clearfperr \ - (*(void (*)(void)) \ - PyUFunc_API[27]) -#define PyUFunc_getfperr \ - (*(int (*)(void)) \ - PyUFunc_API[28]) -#define PyUFunc_handlefperr \ - (*(int (*)(int, PyObject *, int, int *)) \ - PyUFunc_API[29]) -#define PyUFunc_ReplaceLoopBySignature \ - (*(int (*)(PyUFuncObject *, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)) \ - PyUFunc_API[30]) -#define PyUFunc_FromFuncAndDataAndSignature \ - (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int, const char *)) \ - PyUFunc_API[31]) -#define PyUFunc_SetUsesArraysAsData \ - (*(int 
(*)(void **, size_t)) \ - PyUFunc_API[32]) -#define PyUFunc_e_e \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[33]) -#define PyUFunc_e_e_As_f_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[34]) -#define PyUFunc_e_e_As_d_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[35]) -#define PyUFunc_ee_e \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[36]) -#define PyUFunc_ee_e_As_ff_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[37]) -#define PyUFunc_ee_e_As_dd_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[38]) -#define PyUFunc_DefaultTypeResolver \ - (*(int (*)(PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyObject *, PyArray_Descr **)) \ - PyUFunc_API[39]) -#define PyUFunc_ValidateCasting \ - (*(int (*)(PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyArray_Descr **)) \ - PyUFunc_API[40]) - -static int -_import_umath(void) -{ - PyObject *numpy = PyImport_ImportModule("numpy.core.umath"); - PyObject *c_api = NULL; - - if (numpy == NULL) { - PyErr_SetString(PyExc_ImportError, "numpy.core.umath failed to import"); - return -1; - } - c_api = PyObject_GetAttrString(numpy, "_UFUNC_API"); - Py_DECREF(numpy); - if (c_api == NULL) { - PyErr_SetString(PyExc_AttributeError, "_UFUNC_API not found"); - return -1; - } - -#if PY_VERSION_HEX >= 0x03000000 - if (!PyCapsule_CheckExact(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCapsule object"); - Py_DECREF(c_api); - return -1; - } - PyUFunc_API = (void **)PyCapsule_GetPointer(c_api, NULL); -#else - if (!PyCObject_Check(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCObject object"); - Py_DECREF(c_api); - return -1; - } - PyUFunc_API = (void **)PyCObject_AsVoidPtr(c_api); -#endif - Py_DECREF(c_api); - if (PyUFunc_API == NULL) { - PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is NULL pointer"); - return -1; - } - return 0; -} - -#if PY_VERSION_HEX >= 0x03000000 -#define NUMPY_IMPORT_UMATH_RETVAL NULL -#else -#define NUMPY_IMPORT_UMATH_RETVAL -#endif - -#define import_umath() \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError,\ - "numpy.core.umath failed to import");\ - return NUMPY_IMPORT_UMATH_RETVAL;\ - }\ - } while(0) - -#define import_umath1(ret) \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError,\ - "numpy.core.umath failed to import");\ - return ret;\ - }\ - } while(0) - -#define import_umath2(ret, msg) \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError, msg);\ - return ret;\ - }\ - } while(0) - -#define import_ufunc() \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError,\ - "numpy.core.umath failed to import");\ - }\ - } while(0) - -#endif diff --git a/include/numpy/_neighborhood_iterator_imp.h b/include/numpy/_neighborhood_iterator_imp.h deleted file mode 100644 index e8860cbc7..000000000 --- a/include/numpy/_neighborhood_iterator_imp.h +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef _NPY_INCLUDE_NEIGHBORHOOD_IMP -#error You should not include this header directly -#endif -/* - * Private API (here for inline) - */ -static NPY_INLINE int -_PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter); - -/* - * Update to next item of the iterator - * - * Note: this simply increment the coordinates vector, last 
dimension - * incremented first , i.e, for dimension 3 - * ... - * -1, -1, -1 - * -1, -1, 0 - * -1, -1, 1 - * .... - * -1, 0, -1 - * -1, 0, 0 - * .... - * 0, -1, -1 - * 0, -1, 0 - * .... - */ -#define _UPDATE_COORD_ITER(c) \ - wb = iter->coordinates[c] < iter->bounds[c][1]; \ - if (wb) { \ - iter->coordinates[c] += 1; \ - return 0; \ - } \ - else { \ - iter->coordinates[c] = iter->bounds[c][0]; \ - } - -static NPY_INLINE int -_PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter) -{ - npy_intp i, wb; - - for (i = iter->nd - 1; i >= 0; --i) { - _UPDATE_COORD_ITER(i) - } - - return 0; -} - -/* - * Version optimized for 2d arrays, manual loop unrolling - */ -static NPY_INLINE int -_PyArrayNeighborhoodIter_IncrCoord2D(PyArrayNeighborhoodIterObject* iter) -{ - npy_intp wb; - - _UPDATE_COORD_ITER(1) - _UPDATE_COORD_ITER(0) - - return 0; -} -#undef _UPDATE_COORD_ITER - -/* - * Advance to the next neighbour - */ -static NPY_INLINE int -PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter) -{ - _PyArrayNeighborhoodIter_IncrCoord (iter); - iter->dataptr = iter->translate((PyArrayIterObject*)iter, iter->coordinates); - - return 0; -} - -/* - * Reset functions - */ -static NPY_INLINE int -PyArrayNeighborhoodIter_Reset(PyArrayNeighborhoodIterObject* iter) -{ - npy_intp i; - - for (i = 0; i < iter->nd; ++i) { - iter->coordinates[i] = iter->bounds[i][0]; - } - iter->dataptr = iter->translate((PyArrayIterObject*)iter, iter->coordinates); - - return 0; -} diff --git a/include/numpy/_numpyconfig.h b/include/numpy/_numpyconfig.h deleted file mode 100644 index d55ffc38d..000000000 --- a/include/numpy/_numpyconfig.h +++ /dev/null @@ -1,29 +0,0 @@ -#define NPY_SIZEOF_SHORT SIZEOF_SHORT -#define NPY_SIZEOF_INT SIZEOF_INT -#define NPY_SIZEOF_LONG SIZEOF_LONG -#define NPY_SIZEOF_FLOAT 4 -#define NPY_SIZEOF_COMPLEX_FLOAT 8 -#define NPY_SIZEOF_DOUBLE 8 -#define NPY_SIZEOF_COMPLEX_DOUBLE 16 -#define NPY_SIZEOF_LONGDOUBLE 16 -#define NPY_SIZEOF_COMPLEX_LONGDOUBLE 32 -#define NPY_SIZEOF_PY_INTPTR_T 8 -#define NPY_SIZEOF_PY_LONG_LONG 8 -#define NPY_SIZEOF_LONGLONG 8 -#define NPY_NO_SMP 0 -#define NPY_HAVE_DECL_ISNAN -#define NPY_HAVE_DECL_ISINF -#define NPY_HAVE_DECL_ISFINITE -#define NPY_HAVE_DECL_SIGNBIT -#define NPY_USE_C99_COMPLEX 1 -#define NPY_HAVE_COMPLEX_DOUBLE 1 -#define NPY_HAVE_COMPLEX_FLOAT 1 -#define NPY_HAVE_COMPLEX_LONG_DOUBLE 1 -#define NPY_USE_C99_FORMATS 1 -#define NPY_VISIBILITY_HIDDEN __attribute__((visibility("hidden"))) -#define NPY_ABI_VERSION 0x01000009 -#define NPY_API_VERSION 0x00000007 - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS 1 -#endif diff --git a/include/numpy/arrayobject.h b/include/numpy/arrayobject.h deleted file mode 100644 index a84766f63..000000000 --- a/include/numpy/arrayobject.h +++ /dev/null @@ -1,22 +0,0 @@ - -/* This expects the following variables to be defined (besides - the usual ones from pyconfig.h - - SIZEOF_LONG_DOUBLE -- sizeof(long double) or sizeof(double) if no - long double is present on platform. 
- CHAR_BIT -- number of bits in a char (usually 8) - (should be in limits.h) - -*/ - -#ifndef Py_ARRAYOBJECT_H -#define Py_ARRAYOBJECT_H - -#include "ndarrayobject.h" -#include "npy_interrupt.h" - -#ifdef NPY_NO_PREFIX -#include "noprefix.h" -#endif - -#endif diff --git a/include/numpy/arrayscalars.h b/include/numpy/arrayscalars.h deleted file mode 100644 index 64450e713..000000000 --- a/include/numpy/arrayscalars.h +++ /dev/null @@ -1,175 +0,0 @@ -#ifndef _NPY_ARRAYSCALARS_H_ -#define _NPY_ARRAYSCALARS_H_ - -#ifndef _MULTIARRAYMODULE -typedef struct { - PyObject_HEAD - npy_bool obval; -} PyBoolScalarObject; -#endif - - -typedef struct { - PyObject_HEAD - signed char obval; -} PyByteScalarObject; - - -typedef struct { - PyObject_HEAD - short obval; -} PyShortScalarObject; - - -typedef struct { - PyObject_HEAD - int obval; -} PyIntScalarObject; - - -typedef struct { - PyObject_HEAD - long obval; -} PyLongScalarObject; - - -typedef struct { - PyObject_HEAD - npy_longlong obval; -} PyLongLongScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned char obval; -} PyUByteScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned short obval; -} PyUShortScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned int obval; -} PyUIntScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned long obval; -} PyULongScalarObject; - - -typedef struct { - PyObject_HEAD - npy_ulonglong obval; -} PyULongLongScalarObject; - - -typedef struct { - PyObject_HEAD - npy_half obval; -} PyHalfScalarObject; - - -typedef struct { - PyObject_HEAD - float obval; -} PyFloatScalarObject; - - -typedef struct { - PyObject_HEAD - double obval; -} PyDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - npy_longdouble obval; -} PyLongDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - npy_cfloat obval; -} PyCFloatScalarObject; - - -typedef struct { - PyObject_HEAD - npy_cdouble obval; -} PyCDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - npy_clongdouble obval; -} PyCLongDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - PyObject * obval; -} PyObjectScalarObject; - -typedef struct { - PyObject_HEAD - npy_datetime obval; - PyArray_DatetimeMetaData obmeta; -} PyDatetimeScalarObject; - -typedef struct { - PyObject_HEAD - npy_timedelta obval; - PyArray_DatetimeMetaData obmeta; -} PyTimedeltaScalarObject; - - -typedef struct { - PyObject_HEAD - char obval; -} PyScalarObject; - -#define PyStringScalarObject PyStringObject -#define PyUnicodeScalarObject PyUnicodeObject - -typedef struct { - PyObject_VAR_HEAD - char *obval; - PyArray_Descr *descr; - int flags; - PyObject *base; -} PyVoidScalarObject; - -/* Macros - PyScalarObject - PyArrType_Type - are defined in ndarrayobject.h -*/ - -#define PyArrayScalar_False ((PyObject *)(&(_PyArrayScalar_BoolValues[0]))) -#define PyArrayScalar_True ((PyObject *)(&(_PyArrayScalar_BoolValues[1]))) -#define PyArrayScalar_FromLong(i) \ - ((PyObject *)(&(_PyArrayScalar_BoolValues[((i)!=0)]))) -#define PyArrayScalar_RETURN_BOOL_FROM_LONG(i) \ - return Py_INCREF(PyArrayScalar_FromLong(i)), \ - PyArrayScalar_FromLong(i) -#define PyArrayScalar_RETURN_FALSE \ - return Py_INCREF(PyArrayScalar_False), \ - PyArrayScalar_False -#define PyArrayScalar_RETURN_TRUE \ - return Py_INCREF(PyArrayScalar_True), \ - PyArrayScalar_True - -#define PyArrayScalar_New(cls) \ - Py##cls##ArrType_Type.tp_alloc(&Py##cls##ArrType_Type, 0) -#define PyArrayScalar_VAL(obj, cls) \ - ((Py##cls##ScalarObject *)obj)->obval -#define PyArrayScalar_ASSIGN(obj, cls, val) \ - 
PyArrayScalar_VAL(obj, cls) = val - -#endif diff --git a/include/numpy/halffloat.h b/include/numpy/halffloat.h deleted file mode 100644 index 944f0ea34..000000000 --- a/include/numpy/halffloat.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __NPY_HALFFLOAT_H__ -#define __NPY_HALFFLOAT_H__ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Half-precision routines - */ - -/* Conversions */ -float npy_half_to_float(npy_half h); -double npy_half_to_double(npy_half h); -npy_half npy_float_to_half(float f); -npy_half npy_double_to_half(double d); -/* Comparisons */ -int npy_half_eq(npy_half h1, npy_half h2); -int npy_half_ne(npy_half h1, npy_half h2); -int npy_half_le(npy_half h1, npy_half h2); -int npy_half_lt(npy_half h1, npy_half h2); -int npy_half_ge(npy_half h1, npy_half h2); -int npy_half_gt(npy_half h1, npy_half h2); -/* faster *_nonan variants for when you know h1 and h2 are not NaN */ -int npy_half_eq_nonan(npy_half h1, npy_half h2); -int npy_half_lt_nonan(npy_half h1, npy_half h2); -int npy_half_le_nonan(npy_half h1, npy_half h2); -/* Miscellaneous functions */ -int npy_half_iszero(npy_half h); -int npy_half_isnan(npy_half h); -int npy_half_isinf(npy_half h); -int npy_half_isfinite(npy_half h); -int npy_half_signbit(npy_half h); -npy_half npy_half_copysign(npy_half x, npy_half y); -npy_half npy_half_spacing(npy_half h); -npy_half npy_half_nextafter(npy_half x, npy_half y); - -/* - * Half-precision constants - */ - -#define NPY_HALF_ZERO (0x0000u) -#define NPY_HALF_PZERO (0x0000u) -#define NPY_HALF_NZERO (0x8000u) -#define NPY_HALF_ONE (0x3c00u) -#define NPY_HALF_NEGONE (0xbc00u) -#define NPY_HALF_PINF (0x7c00u) -#define NPY_HALF_NINF (0xfc00u) -#define NPY_HALF_NAN (0x7e00u) - -#define NPY_MAX_HALF (0x7bffu) - -/* - * Bit-level conversions - */ - -npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f); -npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d); -npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h); -npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/include/numpy/multiarray_api.txt b/include/numpy/multiarray_api.txt deleted file mode 100644 index 7e588f067..000000000 --- a/include/numpy/multiarray_api.txt +++ /dev/null @@ -1,2375 +0,0 @@ - -=========== -Numpy C-API -=========== -:: - - unsigned int - PyArray_GetNDArrayCVersion(void ) - - -Included at the very first so not auto-grabbed and thus not labeled. - -:: - - int - PyArray_SetNumericOps(PyObject *dict) - -Set internal structure with number functions that all arrays will use - -:: - - PyObject * - PyArray_GetNumericOps(void ) - -Get dictionary showing number functions that all arrays will use - -:: - - int - PyArray_INCREF(PyArrayObject *mp) - -For object arrays, increment all internal references. - -:: - - int - PyArray_XDECREF(PyArrayObject *mp) - -Decrement all internal references for object arrays. -(or arrays with object fields) - -:: - - void - PyArray_SetStringFunction(PyObject *op, int repr) - -Set the array print function to be a Python function. - -:: - - PyArray_Descr * - PyArray_DescrFromType(int type) - -Get the PyArray_Descr structure for a type. - -:: - - PyObject * - PyArray_TypeObjectFromType(int type) - -Get a typeobject from a type-number -- can return NULL. - -New reference - -:: - - char * - PyArray_Zero(PyArrayObject *arr) - -Get pointer to zero of correct type for array. 
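As a rough illustration of the descriptor entry points documented above, a zero-filled float64 array can be built from a type number as in the sketch below. The helper name is hypothetical, import_array() is assumed to have run, and the claim that PyArray_Zeros steals the descriptor reference should be re-checked against the documentation for the NumPy version actually in use.

    #include <Python.h>
    #include <numpy/arrayobject.h>

    /* Build a 1-D float64 array of zeros of length n. */
    static PyObject *
    make_zeros(npy_intp n)
    {
        PyArray_Descr *descr = PyArray_DescrFromType(NPY_DOUBLE);
        npy_intp dims[1];

        if (descr == NULL) {
            return NULL;
        }
        dims[0] = n;
        /* PyArray_Zeros is assumed to steal the reference to descr. */
        return PyArray_Zeros(1, dims, descr, 0);
    }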
- -:: - - char * - PyArray_One(PyArrayObject *arr) - -Get pointer to one of correct type for array - -:: - - PyObject * - PyArray_CastToType(PyArrayObject *arr, PyArray_Descr *dtype, int - is_f_order) - -For backward compatibility - -Cast an array using typecode structure. -steals reference to at --- cannot be NULL - -This function always makes a copy of arr, even if the dtype -doesn't change. - -:: - - int - PyArray_CastTo(PyArrayObject *out, PyArrayObject *mp) - -Cast to an already created array. - -:: - - int - PyArray_CastAnyTo(PyArrayObject *out, PyArrayObject *mp) - -Cast to an already created array. Arrays don't have to be "broadcastable" -Only requirement is they have the same number of elements. - -:: - - int - PyArray_CanCastSafely(int fromtype, int totype) - -Check the type coercion rules. - -:: - - npy_bool - PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to) - -leaves reference count alone --- cannot be NULL - -PyArray_CanCastTypeTo is equivalent to this, but adds a 'casting' -parameter. - -:: - - int - PyArray_ObjectType(PyObject *op, int minimum_type) - -Return the typecode of the array a Python object would be converted to - -Returns the type number the result should have, or NPY_NOTYPE on error. - -:: - - PyArray_Descr * - PyArray_DescrFromObject(PyObject *op, PyArray_Descr *mintype) - -new reference -- accepts NULL for mintype - -:: - - PyArrayObject ** - PyArray_ConvertToCommonType(PyObject *op, int *retn) - - -:: - - PyArray_Descr * - PyArray_DescrFromScalar(PyObject *sc) - -Return descr object from array scalar. - -New reference - -:: - - PyArray_Descr * - PyArray_DescrFromTypeObject(PyObject *type) - - -:: - - npy_intp - PyArray_Size(PyObject *op) - -Compute the size of an array (in number of items) - -:: - - PyObject * - PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base) - -Get scalar-equivalent to a region of memory described by a descriptor. - -:: - - PyObject * - PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode) - -Get 0-dim array from scalar - -0-dim array from array-scalar object -always contains a copy of the data -unless outcode is NULL, it is of void type and the referrer does -not own it either. - -steals reference to outcode - -:: - - void - PyArray_ScalarAsCtype(PyObject *scalar, void *ctypeptr) - -Convert to c-type - -no error checking is performed -- ctypeptr must be same type as scalar -in case of flexible type, the data is not copied -into ctypeptr which is expected to be a pointer to pointer - -:: - - int - PyArray_CastScalarToCtype(PyObject *scalar, void - *ctypeptr, PyArray_Descr *outcode) - -Cast Scalar to c-type - -The output buffer must be large-enough to receive the value -Even for flexible types which is different from ScalarAsCtype -where only a reference for flexible types is returned - -This may not work right on narrow builds for NumPy unicode scalars. - -:: - - int - PyArray_CastScalarDirect(PyObject *scalar, PyArray_Descr - *indescr, void *ctypeptr, int outtype) - -Cast Scalar to c-type - -:: - - PyObject * - PyArray_ScalarFromObject(PyObject *object) - -Get an Array Scalar From a Python Object - -Returns NULL if unsuccessful but error is only set if another error occurred. -Currently only Numeric-like object supported. - -:: - - PyArray_VectorUnaryFunc * - PyArray_GetCastFunc(PyArray_Descr *descr, int type_num) - -Get a cast function to cast from the input descriptor to the -output type_number (must be a registered data-type). -Returns NULL if un-successful. 
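The casting helpers documented above are typically combined as in the following sketch (hypothetical helper name, import_array() assumed to have run). Note that the text above states PyArray_CastToType always copies and steals the descriptor reference.

    #include <Python.h>
    #include <numpy/arrayobject.h>

    /* Return a float64 copy of arr, refusing unsafe casts. */
    static PyObject *
    as_float64(PyArrayObject *arr)
    {
        PyArray_Descr *to = PyArray_DescrFromType(NPY_DOUBLE);

        if (to == NULL) {
            return NULL;
        }
        if (!PyArray_CanCastTo(PyArray_DESCR(arr), to)) {
            Py_DECREF(to);
            PyErr_SetString(PyExc_TypeError,
                            "input cannot be cast safely to float64");
            return NULL;
        }
        /* PyArray_CastToType copies and steals the reference to 'to'. */
        return PyArray_CastToType(arr, to, 0);
    }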
- -:: - - PyObject * - PyArray_FromDims(int nd, int *d, int type) - -Construct an empty array from dimensions and typenum - -:: - - PyObject * - PyArray_FromDimsAndDataAndDescr(int nd, int *d, PyArray_Descr - *descr, char *data) - -Like FromDimsAndData but uses the Descr structure instead of typecode -as input. - -:: - - PyObject * - PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int - min_depth, int max_depth, int flags, PyObject - *context) - -Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags -Steals a reference to newtype --- which can be NULL - -:: - - PyObject * - PyArray_EnsureArray(PyObject *op) - -This is a quick wrapper around PyArray_FromAny(op, NULL, 0, 0, ENSUREARRAY) -that special cases Arrays and PyArray_Scalars up front -It *steals a reference* to the object -It also guarantees that the result is PyArray_Type -Because it decrefs op if any conversion needs to take place -so it can be used like PyArray_EnsureArray(some_function(...)) - -:: - - PyObject * - PyArray_EnsureAnyArray(PyObject *op) - - -:: - - PyObject * - PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char - *sep) - - -Given a ``FILE *`` pointer ``fp``, and a ``PyArray_Descr``, return an -array corresponding to the data encoded in that file. - -If the dtype is NULL, the default array type is used (double). -If non-null, the reference is stolen. - -The number of elements to read is given as ``num``; if it is < 0, then -then as many as possible are read. - -If ``sep`` is NULL or empty, then binary data is assumed, else -text data, with ``sep`` as the separator between elements. Whitespace in -the separator matches any length of whitespace in the text, and a match -for whitespace around the separator is added. - -For memory-mapped files, use the buffer interface. No more data than -necessary is read by this routine. - -:: - - PyObject * - PyArray_FromString(char *data, npy_intp slen, PyArray_Descr - *dtype, npy_intp num, char *sep) - - -Given a pointer to a string ``data``, a string length ``slen``, and -a ``PyArray_Descr``, return an array corresponding to the data -encoded in that string. - -If the dtype is NULL, the default array type is used (double). -If non-null, the reference is stolen. - -If ``slen`` is < 0, then the end of string is used for text data. -It is an error for ``slen`` to be < 0 for binary data (since embedded NULLs -would be the norm). - -The number of elements to read is given as ``num``; if it is < 0, then -then as many as possible are read. - -If ``sep`` is NULL or empty, then binary data is assumed, else -text data, with ``sep`` as the separator between elements. Whitespace in -the separator matches any length of whitespace in the text, and a match -for whitespace around the separator is added. - -:: - - PyObject * - PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type, npy_intp - count, npy_intp offset) - - -:: - - PyObject * - PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count) - - -steals a reference to dtype (which cannot be NULL) - -:: - - PyObject * - PyArray_Return(PyArrayObject *mp) - - -Return either an array or the appropriate Python object if the array -is 0d and matches a Python type. 
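PyArray_FromAny and PyArray_Return, documented above, are the usual entry and exit points of a C-level helper that accepts arbitrary array-like input. A minimal sketch (function name illustrative, import_array() assumed to have run):

    #include <Python.h>
    #include <numpy/arrayobject.h>

    /* Coerce any array-like object to an aligned, C-contiguous float64
     * array and hand it back, unwrapping 0-d results to scalars. */
    static PyObject *
    to_double_array(PyObject *self, PyObject *obj)
    {
        PyArray_Descr *dtype = PyArray_DescrFromType(NPY_DOUBLE);
        PyObject *arr;

        (void)self;
        if (dtype == NULL) {
            return NULL;
        }
        /* PyArray_FromAny steals the dtype reference. */
        arr = PyArray_FromAny(obj, dtype, 0, 0,
                              NPY_ARRAY_ALIGNED | NPY_ARRAY_C_CONTIGUOUS,
                              NULL);
        if (arr == NULL) {
            return NULL;
        }
        return PyArray_Return((PyArrayObject *)arr);
    }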
- -:: - - PyObject * - PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int - offset) - -Get a subset of bytes from each element of the array - -:: - - int - PyArray_SetField(PyArrayObject *self, PyArray_Descr *dtype, int - offset, PyObject *val) - -Set a subset of bytes from each element of the array - -:: - - PyObject * - PyArray_Byteswap(PyArrayObject *self, npy_bool inplace) - - -:: - - PyObject * - PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int - refcheck, NPY_ORDER order) - -Resize (reallocate data). Only works if nothing else is referencing this -array and it is contiguous. If refcheck is 0, then the reference count is -not checked and assumed to be 1. You still must own this data and have no -weak-references and no base object. - -:: - - int - PyArray_MoveInto(PyArrayObject *dst, PyArrayObject *src) - -Move the memory of one array into another, allowing for overlapping data. - -Returns 0 on success, negative on failure. - -:: - - int - PyArray_CopyInto(PyArrayObject *dst, PyArrayObject *src) - -Copy an Array into another array. -Broadcast to the destination shape if necessary. - -Returns 0 on success, -1 on failure. - -:: - - int - PyArray_CopyAnyInto(PyArrayObject *dst, PyArrayObject *src) - -Copy an Array into another array -- memory must not overlap -Does not require src and dest to have "broadcastable" shapes -(only the same number of elements). - -TODO: For NumPy 2.0, this could accept an order parameter which -only allows NPY_CORDER and NPY_FORDER. Could also rename -this to CopyAsFlat to make the name more intuitive. - -Returns 0 on success, -1 on error. - -:: - - int - PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object) - - -:: - - PyObject * - PyArray_NewCopy(PyArrayObject *obj, NPY_ORDER order) - -Copy an array. - -:: - - PyObject * - PyArray_ToList(PyArrayObject *self) - -To List - -:: - - PyObject * - PyArray_ToString(PyArrayObject *self, NPY_ORDER order) - - -:: - - int - PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format) - -To File - -:: - - int - PyArray_Dump(PyObject *self, PyObject *file, int protocol) - - -:: - - PyObject * - PyArray_Dumps(PyObject *self, int protocol) - - -:: - - int - PyArray_ValidType(int type) - -Is the typenum valid? - -:: - - void - PyArray_UpdateFlags(PyArrayObject *ret, int flagmask) - -Update Several Flags at once. - -:: - - PyObject * - PyArray_New(PyTypeObject *subtype, int nd, npy_intp *dims, int - type_num, npy_intp *strides, void *data, int itemsize, int - flags, PyObject *obj) - -Generic new array creation routine. - -:: - - PyObject * - PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int - nd, npy_intp *dims, npy_intp *strides, void - *data, int flags, PyObject *obj) - -Generic new array creation routine. - -steals a reference to descr (even on failure) - -:: - - PyArray_Descr * - PyArray_DescrNew(PyArray_Descr *base) - -base cannot be NULL - -:: - - PyArray_Descr * - PyArray_DescrNewFromType(int type_num) - - -:: - - double - PyArray_GetPriority(PyObject *obj, double default_) - -Get Priority from object - -:: - - PyObject * - PyArray_IterNew(PyObject *obj) - -Get Iterator. - -:: - - PyObject * - PyArray_MultiIterNew(int n, ... 
) - -Get MultiIterator, - -:: - - int - PyArray_PyIntAsInt(PyObject *o) - - -:: - - npy_intp - PyArray_PyIntAsIntp(PyObject *o) - - -:: - - int - PyArray_Broadcast(PyArrayMultiIterObject *mit) - - -:: - - void - PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj) - -Assumes contiguous - -:: - - int - PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj) - - -:: - - npy_bool - PyArray_CheckStrides(int elsize, int nd, npy_intp numbytes, npy_intp - offset, npy_intp *dims, npy_intp *newstrides) - - -:: - - PyArray_Descr * - PyArray_DescrNewByteorder(PyArray_Descr *self, char newendian) - - -returns a copy of the PyArray_Descr structure with the byteorder -altered: -no arguments: The byteorder is swapped (in all subfields as well) -single argument: The byteorder is forced to the given state -(in all subfields as well) - -Valid states: ('big', '>') or ('little' or '<') -('native', or '=') - -If a descr structure with | is encountered it's own -byte-order is not changed but any fields are: - - -Deep bytorder change of a data-type descriptor -Leaves reference count of self unchanged --- does not DECREF self *** - -:: - - PyObject * - PyArray_IterAllButAxis(PyObject *obj, int *inaxis) - -Get Iterator that iterates over all but one axis (don't use this with -PyArray_ITER_GOTO1D). The axis will be over-written if negative -with the axis having the smallest stride. - -:: - - PyObject * - PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int - min_depth, int max_depth, int requires, PyObject - *context) - -steals a reference to descr -- accepts NULL - -:: - - PyObject * - PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int - flags) - -steals reference to newtype --- acc. NULL - -:: - - PyObject * - PyArray_FromInterface(PyObject *origin) - - -:: - - PyObject * - PyArray_FromStructInterface(PyObject *input) - - -:: - - PyObject * - PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject - *context) - - -:: - - NPY_SCALARKIND - PyArray_ScalarKind(int typenum, PyArrayObject **arr) - -ScalarKind - -Returns the scalar kind of a type number, with an -optional tweak based on the scalar value itself. -If no scalar is provided, it returns INTPOS_SCALAR -for both signed and unsigned integers, otherwise -it checks the sign of any signed integer to choose -INTNEG_SCALAR when appropriate. - -:: - - int - PyArray_CanCoerceScalar(int thistype, int neededtype, NPY_SCALARKIND - scalar) - - -Determines whether the data type 'thistype', with -scalar kind 'scalar', can be coerced into 'neededtype'. - -:: - - PyObject * - PyArray_NewFlagsObject(PyObject *obj) - - -Get New ArrayFlagsObject - -:: - - npy_bool - PyArray_CanCastScalar(PyTypeObject *from, PyTypeObject *to) - -See if array scalars can be cast. - -TODO: For NumPy 2.0, add a NPY_CASTING parameter. - -:: - - int - PyArray_CompareUCS4(npy_ucs4 *s1, npy_ucs4 *s2, size_t len) - - -:: - - int - PyArray_RemoveSmallest(PyArrayMultiIterObject *multi) - -Adjusts previously broadcasted iterators so that the axis with -the smallest sum of iterator strides is not iterated over. -Returns dimension which is smallest in the range [0,multi->nd). -A -1 is returned if multi->nd == 0. 
- -don't use with PyArray_ITER_GOTO1D because factors are not adjusted - -:: - - int - PyArray_ElementStrides(PyObject *obj) - - -:: - - void - PyArray_Item_INCREF(char *data, PyArray_Descr *descr) - - -:: - - void - PyArray_Item_XDECREF(char *data, PyArray_Descr *descr) - - -:: - - PyObject * - PyArray_FieldNames(PyObject *fields) - -Return the tuple of ordered field names from a dictionary. - -:: - - PyObject * - PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute) - -Return Transpose. - -:: - - PyObject * - PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int - axis, PyArrayObject *out, NPY_CLIPMODE clipmode) - -Take - -:: - - PyObject * - PyArray_PutTo(PyArrayObject *self, PyObject*values0, PyObject - *indices0, NPY_CLIPMODE clipmode) - -Put values into an array - -:: - - PyObject * - PyArray_PutMask(PyArrayObject *self, PyObject*values0, PyObject*mask0) - -Put values into an array according to a mask. - -:: - - PyObject * - PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis) - -Repeat the array. - -:: - - PyObject * - PyArray_Choose(PyArrayObject *ip, PyObject *op, PyArrayObject - *out, NPY_CLIPMODE clipmode) - - -:: - - int - PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which) - -Sort an array in-place - -:: - - PyObject * - PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) - -ArgSort an array - -:: - - PyObject * - PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, NPY_SEARCHSIDE - side, PyObject *perm) - - -Search the sorted array op1 for the location of the items in op2. The -result is an array of indexes, one for each element in op2, such that if -the item were to be inserted in op1 just before that index the array -would still be in sorted order. - -Parameters ----------- -op1 : PyArrayObject * -Array to be searched, must be 1-D. -op2 : PyObject * -Array of items whose insertion indexes in op1 are wanted -side : {NPY_SEARCHLEFT, NPY_SEARCHRIGHT} -If NPY_SEARCHLEFT, return first valid insertion indexes -If NPY_SEARCHRIGHT, return last valid insertion indexes -perm : PyObject * -Permutation array that sorts op1 (optional) - -Returns -------- -ret : PyObject * -New reference to npy_intp array containing indexes where items in op2 -could be validly inserted into op1. NULL on error. - -Notes ------ -Binary search is used to find the indexes. - -:: - - PyObject * - PyArray_ArgMax(PyArrayObject *op, int axis, PyArrayObject *out) - -ArgMax - -:: - - PyObject * - PyArray_ArgMin(PyArrayObject *op, int axis, PyArrayObject *out) - -ArgMin - -:: - - PyObject * - PyArray_Reshape(PyArrayObject *self, PyObject *shape) - -Reshape - -:: - - PyObject * - PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, NPY_ORDER - order) - -New shape for an array - -:: - - PyObject * - PyArray_Squeeze(PyArrayObject *self) - - -return a new view of the array object with all of its unit-length -dimensions squeezed out if needed, otherwise -return the same array. 
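Illustrative sketch (not part of the original listing; the helper name is made up): an in-place sort followed by a left-biased binary search, combining PyArray_Sort and PyArray_SearchSorted from above. The haystack is assumed to be one-dimensional.

::

    /* Example helper (hypothetical), not part of the NumPy API. */
    static PyObject *
    sorted_positions(PyArrayObject *haystack, PyObject *needles)
    {
        if (PyArray_Sort(haystack, 0, NPY_QUICKSORT) < 0) {
            return NULL;
        }
        /* Returns a new npy_intp index array, or NULL on error. */
        return PyArray_SearchSorted(haystack, needles, NPY_SEARCHLEFT, NULL);
    }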
- -:: - - PyObject * - PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject - *pytype) - -View -steals a reference to type -- accepts NULL - -:: - - PyObject * - PyArray_SwapAxes(PyArrayObject *ap, int a1, int a2) - -SwapAxes - -:: - - PyObject * - PyArray_Max(PyArrayObject *ap, int axis, PyArrayObject *out) - -Max - -:: - - PyObject * - PyArray_Min(PyArrayObject *ap, int axis, PyArrayObject *out) - -Min - -:: - - PyObject * - PyArray_Ptp(PyArrayObject *ap, int axis, PyArrayObject *out) - -Ptp - -:: - - PyObject * - PyArray_Mean(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -Mean - -:: - - PyObject * - PyArray_Trace(PyArrayObject *self, int offset, int axis1, int - axis2, int rtype, PyArrayObject *out) - -Trace - -:: - - PyObject * - PyArray_Diagonal(PyArrayObject *self, int offset, int axis1, int - axis2) - -Diagonal - -In NumPy versions prior to 1.7, this function always returned a copy of -the diagonal array. In 1.7, the code has been updated to compute a view -onto 'self', but it still copies this array before returning, as well as -setting the internal WARN_ON_WRITE flag. In a future version, it will -simply return a view onto self. - -:: - - PyObject * - PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject - *max, PyArrayObject *out) - -Clip - -:: - - PyObject * - PyArray_Conjugate(PyArrayObject *self, PyArrayObject *out) - -Conjugate - -:: - - PyObject * - PyArray_Nonzero(PyArrayObject *self) - -Nonzero - -TODO: In NumPy 2.0, should make the iteration order a parameter. - -:: - - PyObject * - PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out, int variance) - -Set variance to 1 to by-pass square-root calculation and return variance -Std - -:: - - PyObject * - PyArray_Sum(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -Sum - -:: - - PyObject * - PyArray_CumSum(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -CumSum - -:: - - PyObject * - PyArray_Prod(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -Prod - -:: - - PyObject * - PyArray_CumProd(PyArrayObject *self, int axis, int - rtype, PyArrayObject *out) - -CumProd - -:: - - PyObject * - PyArray_All(PyArrayObject *self, int axis, PyArrayObject *out) - -All - -:: - - PyObject * - PyArray_Any(PyArrayObject *self, int axis, PyArrayObject *out) - -Any - -:: - - PyObject * - PyArray_Compress(PyArrayObject *self, PyObject *condition, int - axis, PyArrayObject *out) - -Compress - -:: - - PyObject * - PyArray_Flatten(PyArrayObject *a, NPY_ORDER order) - -Flatten - -:: - - PyObject * - PyArray_Ravel(PyArrayObject *arr, NPY_ORDER order) - -Ravel -Returns a contiguous array - -:: - - npy_intp - PyArray_MultiplyList(npy_intp *l1, int n) - -Multiply a List - -:: - - int - PyArray_MultiplyIntList(int *l1, int n) - -Multiply a List of ints - -:: - - void * - PyArray_GetPtr(PyArrayObject *obj, npy_intp*ind) - -Produce a pointer into array - -:: - - int - PyArray_CompareLists(npy_intp *l1, npy_intp *l2, int n) - -Compare Lists - -:: - - int - PyArray_AsCArray(PyObject **op, void *ptr, npy_intp *dims, int - nd, PyArray_Descr*typedescr) - -Simulate a C-array -steals a reference to typedescr -- can be NULL - -:: - - int - PyArray_As1D(PyObject **op, char **ptr, int *d1, int typecode) - -Convert to a 1D C-array - -:: - - int - PyArray_As2D(PyObject **op, char ***ptr, int *d1, int *d2, int - typecode) - -Convert to a 2D C-array - -:: - - int - PyArray_Free(PyObject *op, void *ptr) - -Free pointers created if As2D is called - -:: - - int - 
PyArray_Converter(PyObject *object, PyObject **address) - - -Useful to pass as converter function for O& processing in PyArgs_ParseTuple. - -This conversion function can be used with the "O&" argument for -PyArg_ParseTuple. It will immediately return an object of array type -or will convert to a NPY_ARRAY_CARRAY any other object. - -If you use PyArray_Converter, you must DECREF the array when finished -as you get a new reference to it. - -:: - - int - PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals) - -PyArray_IntpFromSequence -Returns the number of dimensions or -1 if an error occurred. -vals must be large enough to hold maxvals - -:: - - PyObject * - PyArray_Concatenate(PyObject *op, int axis) - -Concatenate - -Concatenate an arbitrary Python sequence into an array. -op is a python object supporting the sequence interface. -Its elements will be concatenated together to form a single -multidimensional array. If axis is NPY_MAXDIMS or bigger, then -each sequence object will be flattened before concatenation - -:: - - PyObject * - PyArray_InnerProduct(PyObject *op1, PyObject *op2) - -Numeric.innerproduct(a,v) - -:: - - PyObject * - PyArray_MatrixProduct(PyObject *op1, PyObject *op2) - -Numeric.matrixproduct(a,v) -just like inner product but does the swapaxes stuff on the fly - -:: - - PyObject * - PyArray_CopyAndTranspose(PyObject *op) - -Copy and Transpose - -Could deprecate this function, as there isn't a speed benefit over -calling Transpose and then Copy. - -:: - - PyObject * - PyArray_Correlate(PyObject *op1, PyObject *op2, int mode) - -Numeric.correlate(a1,a2,mode) - -:: - - int - PyArray_TypestrConvert(int itemsize, int gentype) - -Typestr converter - -:: - - int - PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at) - -Get typenum from an object -- None goes to NPY_DEFAULT_TYPE -This function takes a Python object representing a type and converts it -to a the correct PyArray_Descr * structure to describe the type. - -Many objects can be used to represent a data-type which in NumPy is -quite a flexible concept. - -This is the central code that converts Python objects to -Type-descriptor objects that are used throughout numpy. - -Returns a new reference in *at, but the returned should not be -modified as it may be one of the canonical immutable objects or -a reference to the input obj. - -:: - - int - PyArray_DescrConverter2(PyObject *obj, PyArray_Descr **at) - -Get typenum from an object -- None goes to NULL - -:: - - int - PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq) - -Get intp chunk from sequence - -This function takes a Python sequence object and allocates and -fills in an intp array with the converted values. - -Remember to free the pointer seq.ptr when done using -PyDimMem_FREE(seq.ptr)** - -:: - - int - PyArray_BufferConverter(PyObject *obj, PyArray_Chunk *buf) - -Get buffer chunk from object - -this function takes a Python object which exposes the (single-segment) -buffer interface and returns a pointer to the data segment - -You should increment the reference count by one of buf->base -if you will hang on to a reference - -You only get a borrowed reference to the object. Do not free the -memory... - -:: - - int - PyArray_AxisConverter(PyObject *obj, int *axis) - -Get axis from an object (possibly None) -- a converter function, - -See also PyArray_ConvertMultiAxis, which also handles a tuple of axes. 
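Illustrative sketch (not part of the original listing; the function name is made up): typical "O&" usage of the converter functions above inside an extension function. Both converters produce new references that must be released.

::

    /* Example function (hypothetical), not part of the NumPy API. */
    static PyObject *
    describe(PyObject *self, PyObject *args)
    {
        PyObject *arr = NULL;
        PyArray_Descr *dtype = NULL;
        PyObject *res;

        if (!PyArg_ParseTuple(args, "O&O&",
                              PyArray_Converter, &arr,
                              PyArray_DescrConverter, &dtype)) {
            return NULL;
        }
        res = Py_BuildValue("ii",
                            PyArray_NDIM((PyArrayObject *)arr),
                            dtype->type_num);
        Py_DECREF(arr);
        Py_DECREF(dtype);
        return res;
    }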
- -:: - - int - PyArray_BoolConverter(PyObject *object, npy_bool *val) - -Convert an object to true / false - -:: - - int - PyArray_ByteorderConverter(PyObject *obj, char *endian) - -Convert object to endian - -:: - - int - PyArray_OrderConverter(PyObject *object, NPY_ORDER *val) - -Convert an object to FORTRAN / C / ANY / KEEP - -:: - - unsigned char - PyArray_EquivTypes(PyArray_Descr *type1, PyArray_Descr *type2) - - -This function returns true if the two typecodes are -equivalent (same basic kind and same itemsize). - -:: - - PyObject * - PyArray_Zeros(int nd, npy_intp *dims, PyArray_Descr *type, int - is_f_order) - -Zeros - -steal a reference -accepts NULL type - -:: - - PyObject * - PyArray_Empty(int nd, npy_intp *dims, PyArray_Descr *type, int - is_f_order) - -Empty - -accepts NULL type -steals referenct to type - -:: - - PyObject * - PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) - -Where - -:: - - PyObject * - PyArray_Arange(double start, double stop, double step, int type_num) - -Arange, - -:: - - PyObject * - PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject - *step, PyArray_Descr *dtype) - - -ArangeObj, - -this doesn't change the references - -:: - - int - PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind) - -Convert object to sort kind - -:: - - PyObject * - PyArray_LexSort(PyObject *sort_keys, int axis) - -LexSort an array providing indices that will sort a collection of arrays -lexicographically. The first key is sorted on first, followed by the second key --- requires that arg"merge"sort is available for each sort_key - -Returns an index array that shows the indexes for the lexicographic sort along -the given axis. - -:: - - PyObject * - PyArray_Round(PyArrayObject *a, int decimals, PyArrayObject *out) - -Round - -:: - - unsigned char - PyArray_EquivTypenums(int typenum1, int typenum2) - - -:: - - int - PyArray_RegisterDataType(PyArray_Descr *descr) - -Register Data type -Does not change the reference count of descr - -:: - - int - PyArray_RegisterCastFunc(PyArray_Descr *descr, int - totype, PyArray_VectorUnaryFunc *castfunc) - -Register Casting Function -Replaces any function currently stored. - -:: - - int - PyArray_RegisterCanCast(PyArray_Descr *descr, int - totype, NPY_SCALARKIND scalar) - -Register a type number indicating that a descriptor can be cast -to it safely - -:: - - void - PyArray_InitArrFuncs(PyArray_ArrFuncs *f) - -Initialize arrfuncs to NULL - -:: - - PyObject * - PyArray_IntTupleFromIntp(int len, npy_intp *vals) - -PyArray_IntTupleFromIntp - -:: - - int - PyArray_TypeNumFromName(char *str) - - -:: - - int - PyArray_ClipmodeConverter(PyObject *object, NPY_CLIPMODE *val) - -Convert an object to NPY_RAISE / NPY_CLIP / NPY_WRAP - -:: - - int - PyArray_OutputConverter(PyObject *object, PyArrayObject **address) - -Useful to pass as converter function for O& processing in -PyArgs_ParseTuple for output arrays - -:: - - PyObject * - PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd) - -Get Iterator broadcast to a particular shape - -:: - - void - _PyArray_SigintHandler(int signum) - - -:: - - void* - _PyArray_GetSigintBuf(void ) - - -:: - - int - PyArray_DescrAlignConverter(PyObject *obj, PyArray_Descr **at) - - -Get type-descriptor from an object forcing alignment if possible -None goes to DEFAULT type. - -any object with the .fields attribute and/or .itemsize attribute (if the -.fields attribute does not give the total size -- i.e. a partial record -naming). 
If itemsize is given it must be >= size computed from fields - -The .fields attribute must return a convertible dictionary if present. -Result inherits from NPY_VOID. - -:: - - int - PyArray_DescrAlignConverter2(PyObject *obj, PyArray_Descr **at) - - -Get type-descriptor from an object forcing alignment if possible -None goes to NULL. - -:: - - int - PyArray_SearchsideConverter(PyObject *obj, void *addr) - -Convert object to searchsorted side - -:: - - PyObject * - PyArray_CheckAxis(PyArrayObject *arr, int *axis, int flags) - -PyArray_CheckAxis - -check that axis is valid -convert 0-d arrays to 1-d arrays - -:: - - npy_intp - PyArray_OverflowMultiplyList(npy_intp *l1, int n) - -Multiply a List of Non-negative numbers with over-flow detection. - -:: - - int - PyArray_CompareString(char *s1, char *s2, size_t len) - - -:: - - PyObject * - PyArray_MultiIterFromObjects(PyObject **mps, int n, int nadd, ... ) - -Get MultiIterator from array of Python objects and any additional - -PyObject **mps -- array of PyObjects -int n - number of PyObjects in the array -int nadd - number of additional arrays to include in the iterator. - -Returns a multi-iterator object. - -:: - - int - PyArray_GetEndianness(void ) - - -:: - - unsigned int - PyArray_GetNDArrayCFeatureVersion(void ) - -Returns the built-in (at compilation time) C API version - -:: - - PyObject * - PyArray_Correlate2(PyObject *op1, PyObject *op2, int mode) - -correlate(a1,a2,mode) - -This function computes the usual correlation (correlate(a1, a2) != -correlate(a2, a1), and conjugate the second argument for complex inputs - -:: - - PyObject* - PyArray_NeighborhoodIterNew(PyArrayIterObject *x, npy_intp - *bounds, int mode, PyArrayObject*fill) - -A Neighborhood Iterator object. - -:: - - void - PyArray_SetDatetimeParseFunction(PyObject *op) - -This function is scheduled to be removed - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - void - PyArray_DatetimeToDatetimeStruct(npy_datetime val, NPY_DATETIMEUNIT - fr, npy_datetimestruct *result) - -Fill the datetime struct from the value and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - void - PyArray_TimedeltaToTimedeltaStruct(npy_timedelta val, NPY_DATETIMEUNIT - fr, npy_timedeltastruct *result) - -Fill the timedelta struct from the timedelta value and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - npy_datetime - PyArray_DatetimeStructToDatetime(NPY_DATETIMEUNIT - fr, npy_datetimestruct *d) - -Create a datetime value from a filled datetime struct and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - npy_datetime - PyArray_TimedeltaStructToTimedelta(NPY_DATETIMEUNIT - fr, npy_timedeltastruct *d) - -Create a timdelta value from a filled timedelta struct and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - NpyIter * - NpyIter_New(PyArrayObject *op, npy_uint32 flags, NPY_ORDER - order, NPY_CASTING casting, PyArray_Descr*dtype) - -Allocate a new iterator for one array object. - -:: - - NpyIter * - NpyIter_MultiNew(int nop, PyArrayObject **op_in, npy_uint32 - flags, NPY_ORDER order, NPY_CASTING - casting, npy_uint32 *op_flags, PyArray_Descr - **op_request_dtypes) - -Allocate a new iterator for more than one array object, using -standard NumPy broadcasting rules and the default buffer size. 
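Illustrative sketch (not part of the original listing; the helper name is made up): constructing a broadcasting iterator over two read-only inputs plus an allocated output with NpyIter_MultiNew. The caller is responsible for calling NpyIter_Deallocate when done.

::

    /* Example helper (hypothetical), not part of the NumPy API. */
    static NpyIter *
    make_binary_iter(PyArrayObject *a, PyArrayObject *b)
    {
        PyArrayObject *ops[3] = {a, b, NULL};   /* NULL => allocate output */
        npy_uint32 op_flags[3] = {NPY_ITER_READONLY,
                                  NPY_ITER_READONLY,
                                  NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE};

        /* Broadcasts a and b together; the output operand is allocated
           with the broadcast shape and the promoted dtype. */
        return NpyIter_MultiNew(3, ops, 0, NPY_KEEPORDER, NPY_SAFE_CASTING,
                                op_flags, NULL);
    }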
- -:: - - NpyIter * - NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 - flags, NPY_ORDER order, NPY_CASTING - casting, npy_uint32 *op_flags, PyArray_Descr - **op_request_dtypes, int oa_ndim, int - **op_axes, npy_intp *itershape, npy_intp - buffersize) - -Allocate a new iterator for multiple array objects, and advanced -options for controlling the broadcasting, shape, and buffer size. - -:: - - NpyIter * - NpyIter_Copy(NpyIter *iter) - -Makes a copy of the iterator - -:: - - int - NpyIter_Deallocate(NpyIter *iter) - -Deallocate an iterator - -:: - - npy_bool - NpyIter_HasDelayedBufAlloc(NpyIter *iter) - -Whether the buffer allocation is being delayed - -:: - - npy_bool - NpyIter_HasExternalLoop(NpyIter *iter) - -Whether the iterator handles the inner loop - -:: - - int - NpyIter_EnableExternalLoop(NpyIter *iter) - -Removes the inner loop handling (so HasExternalLoop returns true) - -:: - - npy_intp * - NpyIter_GetInnerStrideArray(NpyIter *iter) - -Get the array of strides for the inner loop (when HasExternalLoop is true) - -This function may be safely called without holding the Python GIL. - -:: - - npy_intp * - NpyIter_GetInnerLoopSizePtr(NpyIter *iter) - -Get a pointer to the size of the inner loop (when HasExternalLoop is true) - -This function may be safely called without holding the Python GIL. - -:: - - int - NpyIter_Reset(NpyIter *iter, char **errmsg) - -Resets the iterator to its initial state - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char - **errmsg) - -Resets the iterator to its initial state, with new base data pointers. -This function requires great caution. - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_ResetToIterIndexRange(NpyIter *iter, npy_intp istart, npy_intp - iend, char **errmsg) - -Resets the iterator to a new iterator index range - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_GetNDim(NpyIter *iter) - -Gets the number of dimensions being iterated - -:: - - int - NpyIter_GetNOp(NpyIter *iter) - -Gets the number of operands being iterated - -:: - - NpyIter_IterNextFunc * - NpyIter_GetIterNext(NpyIter *iter, char **errmsg) - -Compute the specialized iteration function for an iterator - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - npy_intp - NpyIter_GetIterSize(NpyIter *iter) - -Gets the number of elements being iterated - -:: - - void - NpyIter_GetIterIndexRange(NpyIter *iter, npy_intp *istart, npy_intp - *iend) - -Gets the range of iteration indices being iterated - -:: - - npy_intp - NpyIter_GetIterIndex(NpyIter *iter) - -Gets the current iteration index - -:: - - int - NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex) - -Sets the iterator position to the specified iterindex, -which matches the iteration order of the iterator. - -Returns NPY_SUCCEED on success, NPY_FAIL on failure. 
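Illustrative sketch (not part of the original listing; the helper name is made up): the basic element-by-element iteration pattern using NpyIter_New, NpyIter_GetIterNext and the data-pointer array. ``arr`` is assumed to already have type NPY_DOUBLE, and error handling is minimal.

::

    /* Example helper (hypothetical), not part of the NumPy API. */
    static double
    sum_doubles(PyArrayObject *arr)
    {
        NpyIter *iter = NpyIter_New(arr, NPY_ITER_READONLY,
                                    NPY_KEEPORDER, NPY_NO_CASTING, NULL);
        NpyIter_IterNextFunc *iternext;
        char **dataptr;
        double total = 0.0;

        if (iter == NULL) {
            return -1.0;
        }
        iternext = NpyIter_GetIterNext(iter, NULL);
        if (iternext == NULL) {
            NpyIter_Deallocate(iter);
            return -1.0;
        }
        /* One data pointer per operand; there is a single operand here. */
        dataptr = NpyIter_GetDataPtrArray(iter);
        if (NpyIter_GetIterSize(iter) > 0) {
            do {
                total += *(double *)dataptr[0];
            } while (iternext(iter));
        }
        NpyIter_Deallocate(iter);
        return total;
    }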
- -:: - - npy_bool - NpyIter_HasMultiIndex(NpyIter *iter) - -Whether the iterator is tracking a multi-index - -:: - - int - NpyIter_GetShape(NpyIter *iter, npy_intp *outshape) - -Gets the broadcast shape if a multi-index is being tracked by the iterator, -otherwise gets the shape of the iteration as Fortran-order -(fastest-changing index first). - -The reason Fortran-order is returned when a multi-index -is not enabled is that this is providing a direct view into how -the iterator traverses the n-dimensional space. The iterator organizes -its memory from fastest index to slowest index, and when -a multi-index is enabled, it uses a permutation to recover the original -order. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - NpyIter_GetMultiIndexFunc * - NpyIter_GetGetMultiIndex(NpyIter *iter, char **errmsg) - -Compute a specialized get_multi_index function for the iterator - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_GotoMultiIndex(NpyIter *iter, npy_intp *multi_index) - -Sets the iterator to the specified multi-index, which must have the -correct number of entries for 'ndim'. It is only valid -when NPY_ITER_MULTI_INDEX was passed to the constructor. This operation -fails if the multi-index is out of bounds. - -Returns NPY_SUCCEED on success, NPY_FAIL on failure. - -:: - - int - NpyIter_RemoveMultiIndex(NpyIter *iter) - -Removes multi-index support from an iterator. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - npy_bool - NpyIter_HasIndex(NpyIter *iter) - -Whether the iterator is tracking an index - -:: - - npy_bool - NpyIter_IsBuffered(NpyIter *iter) - -Whether the iterator is buffered - -:: - - npy_bool - NpyIter_IsGrowInner(NpyIter *iter) - -Whether the inner loop can grow if buffering is unneeded - -:: - - npy_intp - NpyIter_GetBufferSize(NpyIter *iter) - -Gets the size of the buffer, or 0 if buffering is not enabled - -:: - - npy_intp * - NpyIter_GetIndexPtr(NpyIter *iter) - -Get a pointer to the index, if it is being tracked - -:: - - int - NpyIter_GotoIndex(NpyIter *iter, npy_intp flat_index) - -If the iterator is tracking an index, sets the iterator -to the specified index. - -Returns NPY_SUCCEED on success, NPY_FAIL on failure. - -:: - - char ** - NpyIter_GetDataPtrArray(NpyIter *iter) - -Get the array of data pointers (1 per object being iterated) - -This function may be safely called without holding the Python GIL. - -:: - - PyArray_Descr ** - NpyIter_GetDescrArray(NpyIter *iter) - -Get the array of data type pointers (1 per object being iterated) - -:: - - PyArrayObject ** - NpyIter_GetOperandArray(NpyIter *iter) - -Get the array of objects being iterated - -:: - - PyArrayObject * - NpyIter_GetIterView(NpyIter *iter, npy_intp i) - -Returns a view to the i-th object with the iterator's internal axes - -:: - - void - NpyIter_GetReadFlags(NpyIter *iter, char *outreadflags) - -Gets an array of read flags (1 per object being iterated) - -:: - - void - NpyIter_GetWriteFlags(NpyIter *iter, char *outwriteflags) - -Gets an array of write flags (1 per object being iterated) - -:: - - void - NpyIter_DebugPrint(NpyIter *iter) - -For debugging - -:: - - npy_bool - NpyIter_IterationNeedsAPI(NpyIter *iter) - -Whether the iteration loop, and in particular the iternext() -function, needs API access. If this is true, the GIL must -be retained while iterating. 
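Illustrative sketch (not part of the original listing; the helper name is made up): using NpyIter_IterationNeedsAPI to decide whether the GIL can be released around a loop that only counts iterations. The NPY_BEGIN/END_THREADS macros come from the NumPy headers.

::

    /* Example helper (hypothetical), not part of the NumPy API. */
    static npy_intp
    count_iterations(NpyIter *iter)
    {
        NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
        int needs_api = NpyIter_IterationNeedsAPI(iter);
        npy_intp count = 0;
        NPY_BEGIN_THREADS_DEF;

        if (iternext == NULL) {
            return -1;
        }
        if (!needs_api) {
            NPY_BEGIN_THREADS;
        }
        if (NpyIter_GetIterSize(iter) > 0) {
            do {
                count++;
            } while (iternext(iter));
        }
        if (!needs_api) {
            NPY_END_THREADS;
        }
        return count;
    }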
- -:: - - void - NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) - -Get an array of strides which are fixed. Any strides which may -change during iteration receive the value NPY_MAX_INTP. Once -the iterator is ready to iterate, call this to get the strides -which will always be fixed in the inner loop, then choose optimized -inner loop functions which take advantage of those fixed strides. - -This function may be safely called without holding the Python GIL. - -:: - - int - NpyIter_RemoveAxis(NpyIter *iter, int axis) - -Removes an axis from iteration. This requires that NPY_ITER_MULTI_INDEX -was set for iterator creation, and does not work if buffering is -enabled. This function also resets the iterator to its initial state. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - npy_intp * - NpyIter_GetAxisStrideArray(NpyIter *iter, int axis) - -Gets the array of strides for the specified axis. -If the iterator is tracking a multi-index, gets the strides -for the axis specified, otherwise gets the strides for -the iteration axis as Fortran order (fastest-changing axis first). - -Returns NULL if an error occurs. - -:: - - npy_bool - NpyIter_RequiresBuffering(NpyIter *iter) - -Whether the iteration could be done with no buffering. - -:: - - char ** - NpyIter_GetInitialDataPtrArray(NpyIter *iter) - -Get the array of data pointers (1 per object being iterated), -directly into the arrays (never pointing to a buffer), for starting -unbuffered iteration. This always returns the addresses for the -iterator position as reset to iterator index 0. - -These pointers are different from the pointers accepted by -NpyIter_ResetBasePointers, because the direction along some -axes may have been reversed, requiring base offsets. - -This function may be safely called without holding the Python GIL. - -:: - - int - NpyIter_CreateCompatibleStrides(NpyIter *iter, npy_intp - itemsize, npy_intp *outstrides) - -Builds a set of strides which are the same as the strides of an -output array created using the NPY_ITER_ALLOCATE flag, where NULL -was passed for op_axes. This is for data packed contiguously, -but not necessarily in C or Fortran order. This should be used -together with NpyIter_GetShape and NpyIter_GetNDim. - -A use case for this function is to match the shape and layout of -the iterator and tack on one or more dimensions. For example, -in order to generate a vector per input value for a numerical gradient, -you pass in ndim*itemsize for itemsize, then add another dimension to -the end with size ndim and stride itemsize. To do the Hessian matrix, -you do the same thing but add two dimensions, or take advantage of -the symmetry and pack it into 1 dimension with a particular encoding. - -This function may only be called if the iterator is tracking a multi-index -and if NPY_ITER_DONT_NEGATE_STRIDES was used to prevent an axis from -being iterated in reverse order. - -If an array is created with this method, simply adding 'itemsize' -for each iteration will traverse the new array matching the -iterator. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - int - PyArray_CastingConverter(PyObject *obj, NPY_CASTING *casting) - -Convert any Python object, *obj*, to an NPY_CASTING enum. - -:: - - npy_intp - PyArray_CountNonzero(PyArrayObject *self) - -Counts the number of non-zero elements in the array. - -Returns -1 on error. - -:: - - PyArray_Descr * - PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2) - -Produces the smallest size and lowest kind type to which both -input types can be cast. 
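Illustrative sketch (not part of the original listing; the helper name is made up): PyArray_PromoteTypes applied to two built-in descriptors. Promoting int64 with float32 yields float64 under these rules; error handling is omitted for brevity.

::

    /* Example helper (hypothetical), not part of the NumPy API. */
    static PyArray_Descr *
    promoted_descr(void)
    {
        PyArray_Descr *a = PyArray_DescrFromType(NPY_INT64);
        PyArray_Descr *b = PyArray_DescrFromType(NPY_FLOAT32);
        PyArray_Descr *res = PyArray_PromoteTypes(a, b);   /* NPY_FLOAT64 */

        Py_DECREF(a);
        Py_DECREF(b);
        return res;   /* new reference, or NULL on error */
    }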
- -:: - - PyArray_Descr * - PyArray_MinScalarType(PyArrayObject *arr) - -If arr is a scalar (has 0 dimensions) with a built-in number data type, -finds the smallest type size/kind which can still represent its data. -Otherwise, returns the array's data type. - - -:: - - PyArray_Descr * - PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, npy_intp - ndtypes, PyArray_Descr **dtypes) - -Produces the result type of a bunch of inputs, using the UFunc -type promotion rules. Use this function when you have a set of -input arrays, and need to determine an output array dtype. - -If all the inputs are scalars (have 0 dimensions) or the maximum "kind" -of the scalars is greater than the maximum "kind" of the arrays, does -a regular type promotion. - -Otherwise, does a type promotion on the MinScalarType -of all the inputs. Data types passed directly are treated as array -types. - - -:: - - npy_bool - PyArray_CanCastArrayTo(PyArrayObject *arr, PyArray_Descr - *to, NPY_CASTING casting) - -Returns 1 if the array object may be cast to the given data type using -the casting rule, 0 otherwise. This differs from PyArray_CanCastTo in -that it handles scalar arrays (0 dimensions) specially, by checking -their value. - -:: - - npy_bool - PyArray_CanCastTypeTo(PyArray_Descr *from, PyArray_Descr - *to, NPY_CASTING casting) - -Returns true if data of type 'from' may be cast to data of type -'to' according to the rule 'casting'. - -:: - - PyArrayObject * - PyArray_EinsteinSum(char *subscripts, npy_intp nop, PyArrayObject - **op_in, PyArray_Descr *dtype, NPY_ORDER - order, NPY_CASTING casting, PyArrayObject *out) - -This function provides summation of array elements according to -the Einstein summation convention. For example: -- trace(a) -> einsum("ii", a) -- transpose(a) -> einsum("ji", a) -- multiply(a,b) -> einsum(",", a, b) -- inner(a,b) -> einsum("i,i", a, b) -- outer(a,b) -> einsum("i,j", a, b) -- matvec(a,b) -> einsum("ij,j", a, b) -- matmat(a,b) -> einsum("ij,jk", a, b) - -subscripts: The string of subscripts for einstein summation. -nop: The number of operands -op_in: The array of operands -dtype: Either NULL, or the data type to force the calculation as. -order: The order for the calculation/the output axes. -casting: What kind of casts should be permitted. -out: Either NULL, or an array into which the output should be placed. - -By default, the labels get placed in alphabetical order -at the end of the output. So, if c = einsum("i,j", a, b) -then c[i,j] == a[i]*b[j], but if c = einsum("j,i", a, b) -then c[i,j] = a[j]*b[i]. - -Alternatively, you can control the output order or prevent -an axis from being summed/force an axis to be summed by providing -indices for the output. This allows us to turn 'trace' into -'diag', for example. -- diag(a) -> einsum("ii->i", a) -- sum(a, axis=0) -> einsum("i...->", a) - -Subscripts at the beginning and end may be specified by -putting an ellipsis "..." in the middle. For example, -the function einsum("i...i", a) takes the diagonal of -the first and last dimensions of the operand, and -einsum("ij...,jk...->ik...") takes the matrix product using -the first two indices of each operand instead of the last two. - -When there is only one operand, no axes being summed, and -no output parameter, this function returns a view -into the operand instead of making a copy. 
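Illustrative sketch (not part of the original listing; the helper name is made up): a matrix-vector product through PyArray_EinsteinSum, equivalent to einsum("ij,j", a, b).

::

    /* Example helper (hypothetical), not part of the NumPy API. */
    static PyArrayObject *
    matvec(PyArrayObject *a, PyArrayObject *b)
    {
        PyArrayObject *ops[2] = {a, b};

        /* dtype NULL lets the calculation dtype be inferred; out NULL
           allocates the result array. */
        return PyArray_EinsteinSum("ij,j", 2, ops, NULL,
                                   NPY_KEEPORDER, NPY_SAFE_CASTING, NULL);
    }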
- -:: - - PyObject * - PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER - order, PyArray_Descr *dtype, int subok) - -Creates a new array with the same shape as the provided one, -with possible memory layout order and data type changes. - -prototype - The array the new one should be like. -order - NPY_CORDER - C-contiguous result. -NPY_FORTRANORDER - Fortran-contiguous result. -NPY_ANYORDER - Fortran if prototype is Fortran, C otherwise. -NPY_KEEPORDER - Keeps the axis ordering of prototype. -dtype - If not NULL, overrides the data type of the result. -subok - If 1, use the prototype's array subtype, otherwise -always create a base-class array. - -NOTE: If dtype is not NULL, steals the dtype reference. - -:: - - int - PyArray_GetArrayParamsFromObject(PyObject *op, PyArray_Descr - *requested_dtype, npy_bool - writeable, PyArray_Descr - **out_dtype, int *out_ndim, npy_intp - *out_dims, PyArrayObject - **out_arr, PyObject *context) - -Retrieves the array parameters for viewing/converting an arbitrary -PyObject* to a NumPy array. This allows the "innate type and shape" -of Python list-of-lists to be discovered without -actually converting to an array. - -In some cases, such as structured arrays and the __array__ interface, -a data type needs to be used to make sense of the object. When -this is needed, provide a Descr for 'requested_dtype', otherwise -provide NULL. This reference is not stolen. Also, if the requested -dtype doesn't modify the interpretation of the input, out_dtype will -still get the "innate" dtype of the object, not the dtype passed -in 'requested_dtype'. - -If writing to the value in 'op' is desired, set the boolean -'writeable' to 1. This raises an error when 'op' is a scalar, list -of lists, or other non-writeable 'op'. - -Result: When success (0 return value) is returned, either out_arr -is filled with a non-NULL PyArrayObject and -the rest of the parameters are untouched, or out_arr is -filled with NULL, and the rest of the parameters are -filled. - -Typical usage: - -PyArrayObject *arr = NULL; -PyArray_Descr *dtype = NULL; -int ndim = 0; -npy_intp dims[NPY_MAXDIMS]; - -if (PyArray_GetArrayParamsFromObject(op, NULL, 1, &dtype, -&ndim, &dims, &arr, NULL) < 0) { -return NULL; -} -if (arr == NULL) { -... validate/change dtype, validate flags, ndim, etc ... -// Could make custom strides here too -arr = PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, -dims, NULL, -is_f_order ? NPY_ARRAY_F_CONTIGUOUS : 0, -NULL); -if (arr == NULL) { -return NULL; -} -if (PyArray_CopyObject(arr, op) < 0) { -Py_DECREF(arr); -return NULL; -} -} -else { -... in this case the other parameters weren't filled, just -validate and possibly copy arr itself ... -} -... use arr ... - -:: - - int - PyArray_ConvertClipmodeSequence(PyObject *object, NPY_CLIPMODE - *modes, int n) - -Convert an object to an array of n NPY_CLIPMODE values. -This is intended to be used in functions where a different mode -could be applied to each axis, like in ravel_multi_index. - -:: - - PyObject * - PyArray_MatrixProduct2(PyObject *op1, PyObject - *op2, PyArrayObject*out) - -Numeric.matrixproduct(a,v,out) -just like inner product but does the swapaxes stuff on the fly - -:: - - npy_bool - NpyIter_IsFirstVisit(NpyIter *iter, int iop) - -Checks to see whether this is the first time the elements -of the specified reduction operand which the iterator points at are -being seen for the first time. The function returns -a reasonable answer for reduction operands and when buffering is -disabled. 
The answer may be incorrect for buffered non-reduction -operands. - -This function is intended to be used in EXTERNAL_LOOP mode only, -and will produce some wrong answers when that mode is not enabled. - -If this function returns true, the caller should also -check the inner loop stride of the operand, because if -that stride is 0, then only the first element of the innermost -external loop is being visited for the first time. - -WARNING: For performance reasons, 'iop' is not bounds-checked, -it is not confirmed that 'iop' is actually a reduction -operand, and it is not confirmed that EXTERNAL_LOOP -mode is enabled. These checks are the responsibility of -the caller, and should be done outside of any inner loops. - -:: - - int - PyArray_SetBaseObject(PyArrayObject *arr, PyObject *obj) - -Sets the 'base' attribute of the array. This steals a reference -to 'obj'. - -Returns 0 on success, -1 on failure. - -:: - - void - PyArray_CreateSortedStridePerm(int ndim, npy_intp - *strides, npy_stride_sort_item - *out_strideperm) - - -This function populates the first ndim elements -of strideperm with sorted descending by their absolute values. -For example, the stride array (4, -2, 12) becomes -[(2, 12), (0, 4), (1, -2)]. - -:: - - void - PyArray_RemoveAxesInPlace(PyArrayObject *arr, npy_bool *flags) - - -Removes the axes flagged as True from the array, -modifying it in place. If an axis flagged for removal -has a shape entry bigger than one, this effectively selects -index zero for that axis. - -WARNING: If an axis flagged for removal has a shape equal to zero, -the array will point to invalid memory. The caller must -validate this! - -For example, this can be used to remove the reduction axes -from a reduction result once its computation is complete. - -:: - - void - PyArray_DebugPrint(PyArrayObject *obj) - -Prints the raw data of the ndarray in a form useful for debugging -low-level C issues. - -:: - - int - PyArray_FailUnlessWriteable(PyArrayObject *obj, const char *name) - - -This function does nothing if obj is writeable, and raises an exception -(and returns -1) if obj is not writeable. It may also do other -house-keeping, such as issuing warnings on arrays which are transitioning -to become views. Always call this function at some point before writing to -an array. - -'name' is a name for the array, used to give better error -messages. Something like "assignment destination", "output array", or even -just "array". - -:: - - int - PyArray_SetUpdateIfCopyBase(PyArrayObject *arr, PyArrayObject *base) - - -Precondition: 'arr' is a copy of 'base' (though possibly with different -strides, ordering, etc.). This function sets the UPDATEIFCOPY flag and the -->base pointer on 'arr', so that when 'arr' is destructed, it will copy any -changes back to 'base'. - -Steals a reference to 'base'. - -Returns 0 on success, -1 on failure. - -:: - - void * - PyDataMem_NEW(size_t size) - -Allocates memory for array data. - -:: - - void - PyDataMem_FREE(void *ptr) - -Free memory for array data. - -:: - - void * - PyDataMem_RENEW(void *ptr, size_t size) - -Reallocate/resize memory for array data. - -:: - - PyDataMem_EventHookFunc * - PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook, void - *user_data, void **old_data) - -Sets the allocation event hook for numpy array data. -Takes a PyDataMem_EventHookFunc *, which has the signature: -void hook(void *old, void *new, size_t size, void *user_data). -Also takes a void *user_data, and void **old_data. - -Returns a pointer to the previous hook or NULL. 
If old_data is -non-NULL, the previous user_data pointer will be copied to it. - -If not NULL, hook will be called at the end of each PyDataMem_NEW/FREE/RENEW: -result = PyDataMem_NEW(size) -> (*hook)(NULL, result, size, user_data) -PyDataMem_FREE(ptr) -> (*hook)(ptr, NULL, 0, user_data) -result = PyDataMem_RENEW(ptr, size) -> (*hook)(ptr, result, size, user_data) - -When the hook is called, the GIL will be held by the calling -thread. The hook should be written to be reentrant, if it performs -operations that might cause new allocation events (such as the -creation/descruction numpy objects, or creating/destroying Python -objects which might cause a gc) - diff --git a/include/numpy/ndarrayobject.h b/include/numpy/ndarrayobject.h deleted file mode 100644 index f00dd7744..000000000 --- a/include/numpy/ndarrayobject.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * DON'T INCLUDE THIS DIRECTLY. - */ - -#ifndef NPY_NDARRAYOBJECT_H -#define NPY_NDARRAYOBJECT_H -#ifdef __cplusplus -#define CONFUSE_EMACS { -#define CONFUSE_EMACS2 } -extern "C" CONFUSE_EMACS -#undef CONFUSE_EMACS -#undef CONFUSE_EMACS2 -/* ... otherwise a semi-smart identer (like emacs) tries to indent - everything when you're typing */ -#endif - -#include "ndarraytypes.h" - -/* Includes the "function" C-API -- these are all stored in a - list of pointers --- one for each file - The two lists are concatenated into one in multiarray. - - They are available as import_array() -*/ - -#include "__multiarray_api.h" - - -/* C-API that requries previous API to be defined */ - -#define PyArray_DescrCheck(op) (((PyObject*)(op))->ob_type==&PyArrayDescr_Type) - -#define PyArray_Check(op) PyObject_TypeCheck(op, &PyArray_Type) -#define PyArray_CheckExact(op) (((PyObject*)(op))->ob_type == &PyArray_Type) - -#define PyArray_HasArrayInterfaceType(op, type, context, out) \ - ((((out)=PyArray_FromStructInterface(op)) != Py_NotImplemented) || \ - (((out)=PyArray_FromInterface(op)) != Py_NotImplemented) || \ - (((out)=PyArray_FromArrayAttr(op, type, context)) != \ - Py_NotImplemented)) - -#define PyArray_HasArrayInterface(op, out) \ - PyArray_HasArrayInterfaceType(op, NULL, NULL, out) - -#define PyArray_IsZeroDim(op) (PyArray_Check(op) && \ - (PyArray_NDIM((PyArrayObject *)op) == 0)) - -#define PyArray_IsScalar(obj, cls) \ - (PyObject_TypeCheck(obj, &Py##cls##ArrType_Type)) - -#define PyArray_CheckScalar(m) (PyArray_IsScalar(m, Generic) || \ - PyArray_IsZeroDim(m)) - -#define PyArray_IsPythonNumber(obj) \ - (PyInt_Check(obj) || PyFloat_Check(obj) || PyComplex_Check(obj) || \ - PyLong_Check(obj) || PyBool_Check(obj)) - -#define PyArray_IsPythonScalar(obj) \ - (PyArray_IsPythonNumber(obj) || PyString_Check(obj) || \ - PyUnicode_Check(obj)) - -#define PyArray_IsAnyScalar(obj) \ - (PyArray_IsScalar(obj, Generic) || PyArray_IsPythonScalar(obj)) - -#define PyArray_CheckAnyScalar(obj) (PyArray_IsPythonScalar(obj) || \ - PyArray_CheckScalar(obj)) - -#define PyArray_IsIntegerScalar(obj) (PyInt_Check(obj) \ - || PyLong_Check(obj) \ - || PyArray_IsScalar((obj), Integer)) - - -#define PyArray_GETCONTIGUOUS(m) (PyArray_ISCONTIGUOUS(m) ? 
\ - Py_INCREF(m), (m) : \ - (PyArrayObject *)(PyArray_Copy(m))) - -#define PyArray_SAMESHAPE(a1,a2) ((PyArray_NDIM(a1) == PyArray_NDIM(a2)) && \ - PyArray_CompareLists(PyArray_DIMS(a1), \ - PyArray_DIMS(a2), \ - PyArray_NDIM(a1))) - -#define PyArray_SIZE(m) PyArray_MultiplyList(PyArray_DIMS(m), PyArray_NDIM(m)) -#define PyArray_NBYTES(m) (PyArray_ITEMSIZE(m) * PyArray_SIZE(m)) -#define PyArray_FROM_O(m) PyArray_FromAny(m, NULL, 0, 0, 0, NULL) - -#define PyArray_FROM_OF(m,flags) PyArray_CheckFromAny(m, NULL, 0, 0, flags, \ - NULL) - -#define PyArray_FROM_OT(m,type) PyArray_FromAny(m, \ - PyArray_DescrFromType(type), 0, 0, 0, NULL); - -#define PyArray_FROM_OTF(m, type, flags) \ - PyArray_FromAny(m, PyArray_DescrFromType(type), 0, 0, \ - (((flags) & NPY_ARRAY_ENSURECOPY) ? \ - ((flags) | NPY_ARRAY_DEFAULT) : (flags)), NULL) - -#define PyArray_FROMANY(m, type, min, max, flags) \ - PyArray_FromAny(m, PyArray_DescrFromType(type), min, max, \ - (((flags) & NPY_ARRAY_ENSURECOPY) ? \ - (flags) | NPY_ARRAY_DEFAULT : (flags)), NULL) - -#define PyArray_ZEROS(m, dims, type, is_f_order) \ - PyArray_Zeros(m, dims, PyArray_DescrFromType(type), is_f_order) - -#define PyArray_EMPTY(m, dims, type, is_f_order) \ - PyArray_Empty(m, dims, PyArray_DescrFromType(type), is_f_order) - -#define PyArray_FILLWBYTE(obj, val) memset(PyArray_DATA(obj), val, \ - PyArray_NBYTES(obj)) - -#define PyArray_REFCOUNT(obj) (((PyObject *)(obj))->ob_refcnt) -#define NPY_REFCOUNT PyArray_REFCOUNT -#define NPY_MAX_ELSIZE (2 * NPY_SIZEOF_LONGDOUBLE) - -#define PyArray_ContiguousFromAny(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_DEFAULT, NULL) - -#define PyArray_EquivArrTypes(a1, a2) \ - PyArray_EquivTypes(PyArray_DESCR(a1), PyArray_DESCR(a2)) - -#define PyArray_EquivByteorders(b1, b2) \ - (((b1) == (b2)) || (PyArray_ISNBO(b1) == PyArray_ISNBO(b2))) - -#define PyArray_SimpleNew(nd, dims, typenum) \ - PyArray_New(&PyArray_Type, nd, dims, typenum, NULL, NULL, 0, 0, NULL) - -#define PyArray_SimpleNewFromData(nd, dims, typenum, data) \ - PyArray_New(&PyArray_Type, nd, dims, typenum, NULL, \ - data, 0, NPY_ARRAY_CARRAY, NULL) - -#define PyArray_SimpleNewFromDescr(nd, dims, descr) \ - PyArray_NewFromDescr(&PyArray_Type, descr, nd, dims, \ - NULL, NULL, 0, NULL) - -#define PyArray_ToScalar(data, arr) \ - PyArray_Scalar(data, PyArray_DESCR(arr), (PyObject *)arr) - - -/* These might be faster without the dereferencing of obj - going on inside -- of course an optimizing compiler should - inline the constants inside a for loop making it a moot point -*/ - -#define PyArray_GETPTR1(obj, i) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0])) - -#define PyArray_GETPTR2(obj, i, j) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0] + \ - (j)*PyArray_STRIDES(obj)[1])) - -#define PyArray_GETPTR3(obj, i, j, k) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0] + \ - (j)*PyArray_STRIDES(obj)[1] + \ - (k)*PyArray_STRIDES(obj)[2])) - -#define PyArray_GETPTR4(obj, i, j, k, l) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0] + \ - (j)*PyArray_STRIDES(obj)[1] + \ - (k)*PyArray_STRIDES(obj)[2] + \ - (l)*PyArray_STRIDES(obj)[3])) - -static NPY_INLINE void -PyArray_XDECREF_ERR(PyArrayObject *arr) -{ - if (arr != NULL) { - if (PyArray_FLAGS(arr) & NPY_ARRAY_UPDATEIFCOPY) { - PyArrayObject *base = (PyArrayObject *)PyArray_BASE(arr); - PyArray_ENABLEFLAGS(base, NPY_ARRAY_WRITEABLE); - PyArray_CLEARFLAGS(arr, NPY_ARRAY_UPDATEIFCOPY); - } - 
Py_DECREF(arr); - } -} - -#define PyArray_DESCR_REPLACE(descr) do { \ - PyArray_Descr *_new_; \ - _new_ = PyArray_DescrNew(descr); \ - Py_XDECREF(descr); \ - descr = _new_; \ - } while(0) - -/* Copy should always return contiguous array */ -#define PyArray_Copy(obj) PyArray_NewCopy(obj, NPY_CORDER) - -#define PyArray_FromObject(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_BEHAVED | \ - NPY_ARRAY_ENSUREARRAY, NULL) - -#define PyArray_ContiguousFromObject(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_DEFAULT | \ - NPY_ARRAY_ENSUREARRAY, NULL) - -#define PyArray_CopyFromObject(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_ENSURECOPY | \ - NPY_ARRAY_DEFAULT | \ - NPY_ARRAY_ENSUREARRAY, NULL) - -#define PyArray_Cast(mp, type_num) \ - PyArray_CastToType(mp, PyArray_DescrFromType(type_num), 0) - -#define PyArray_Take(ap, items, axis) \ - PyArray_TakeFrom(ap, items, axis, NULL, NPY_RAISE) - -#define PyArray_Put(ap, items, values) \ - PyArray_PutTo(ap, items, values, NPY_RAISE) - -/* Compatibility with old Numeric stuff -- don't use in new code */ - -#define PyArray_FromDimsAndData(nd, d, type, data) \ - PyArray_FromDimsAndDataAndDescr(nd, d, PyArray_DescrFromType(type), \ - data) - - -/* - Check to see if this key in the dictionary is the "title" - entry of the tuple (i.e. a duplicate dictionary entry in the fields - dict. -*/ - -#define NPY_TITLE_KEY(key, value) ((PyTuple_GET_SIZE((value))==3) && \ - (PyTuple_GET_ITEM((value), 2) == (key))) - - -/* Define python version independent deprecation macro */ - -#if PY_VERSION_HEX >= 0x02050000 -#define DEPRECATE(msg) PyErr_WarnEx(PyExc_DeprecationWarning,msg,1) -#define DEPRECATE_FUTUREWARNING(msg) PyErr_WarnEx(PyExc_FutureWarning,msg,1) -#else -#define DEPRECATE(msg) PyErr_Warn(PyExc_DeprecationWarning,msg) -#define DEPRECATE_FUTUREWARNING(msg) PyErr_Warn(PyExc_FutureWarning,msg) -#endif - - -#ifdef __cplusplus -} -#endif - - -#endif /* NPY_NDARRAYOBJECT_H */ diff --git a/include/numpy/ndarraytypes.h b/include/numpy/ndarraytypes.h deleted file mode 100644 index 04d037ec8..000000000 --- a/include/numpy/ndarraytypes.h +++ /dev/null @@ -1,1731 +0,0 @@ -#ifndef NDARRAYTYPES_H -#define NDARRAYTYPES_H - -/* numpyconfig.h is auto-generated by the installer */ -#include "numpyconfig.h" - -#include "npy_common.h" -#include "npy_endian.h" -#include "npy_cpu.h" -#include "utils.h" - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#else - #define NPY_NO_EXPORT static -#endif - -/* Only use thread if configured in config and python supports it */ -#if defined WITH_THREAD && !NPY_NO_SMP - #define NPY_ALLOW_THREADS 1 -#else - #define NPY_ALLOW_THREADS 0 -#endif - - - -/* - * There are several places in the code where an array of dimensions - * is allocated statically. This is the size of that static - * allocation. - * - * The array creation itself could have arbitrary dimensions but all - * the places where static allocation is used would need to be changed - * to dynamic (including inside of several structures) - */ - -#define NPY_MAXDIMS 32 -#define NPY_MAXARGS 32 - -/* Used for Converter Functions "O&" code in ParseTuple */ -#define NPY_FAIL 0 -#define NPY_SUCCEED 1 - -/* - * Binary compatibility version number. 
This number is increased - * whenever the C-API is changed such that binary compatibility is - * broken, i.e. whenever a recompile of extension modules is needed. - */ -#define NPY_VERSION NPY_ABI_VERSION - -/* - * Minor API version. This number is increased whenever a change is - * made to the C-API -- whether it breaks binary compatibility or not. - * Some changes, such as adding a function pointer to the end of the - * function table, can be made without breaking binary compatibility. - * In this case, only the NPY_FEATURE_VERSION (*not* NPY_VERSION) - * would be increased. Whenever binary compatibility is broken, both - * NPY_VERSION and NPY_FEATURE_VERSION should be increased. - */ -#define NPY_FEATURE_VERSION NPY_API_VERSION - -enum NPY_TYPES { NPY_BOOL=0, - NPY_BYTE, NPY_UBYTE, - NPY_SHORT, NPY_USHORT, - NPY_INT, NPY_UINT, - NPY_LONG, NPY_ULONG, - NPY_LONGLONG, NPY_ULONGLONG, - NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE, - NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE, - NPY_OBJECT=17, - NPY_STRING, NPY_UNICODE, - NPY_VOID, - /* - * New 1.6 types appended, may be integrated - * into the above in 2.0. - */ - NPY_DATETIME, NPY_TIMEDELTA, NPY_HALF, - - NPY_NTYPES, - NPY_NOTYPE, - NPY_CHAR, /* special flag */ - NPY_USERDEF=256, /* leave room for characters */ - - /* The number of types not including the new 1.6 types */ - NPY_NTYPES_ABI_COMPATIBLE=21 -}; - -/* basetype array priority */ -#define NPY_PRIORITY 0.0 - -/* default subtype priority */ -#define NPY_SUBTYPE_PRIORITY 1.0 - -/* default scalar priority */ -#define NPY_SCALAR_PRIORITY -1000000.0 - -/* How many floating point types are there (excluding half) */ -#define NPY_NUM_FLOATTYPE 3 - -/* - * These characters correspond to the array type and the struct - * module - */ - -enum NPY_TYPECHAR { - NPY_BOOLLTR = '?', - NPY_BYTELTR = 'b', - NPY_UBYTELTR = 'B', - NPY_SHORTLTR = 'h', - NPY_USHORTLTR = 'H', - NPY_INTLTR = 'i', - NPY_UINTLTR = 'I', - NPY_LONGLTR = 'l', - NPY_ULONGLTR = 'L', - NPY_LONGLONGLTR = 'q', - NPY_ULONGLONGLTR = 'Q', - NPY_HALFLTR = 'e', - NPY_FLOATLTR = 'f', - NPY_DOUBLELTR = 'd', - NPY_LONGDOUBLELTR = 'g', - NPY_CFLOATLTR = 'F', - NPY_CDOUBLELTR = 'D', - NPY_CLONGDOUBLELTR = 'G', - NPY_OBJECTLTR = 'O', - NPY_STRINGLTR = 'S', - NPY_STRINGLTR2 = 'a', - NPY_UNICODELTR = 'U', - NPY_VOIDLTR = 'V', - NPY_DATETIMELTR = 'M', - NPY_TIMEDELTALTR = 'm', - NPY_CHARLTR = 'c', - - /* - * No Descriptor, just a define -- this let's - * Python users specify an array of integers - * large enough to hold a pointer on the - * platform - */ - NPY_INTPLTR = 'p', - NPY_UINTPLTR = 'P', - - /* - * These are for dtype 'kinds', not dtype 'typecodes' - * as the above are for. 
- */ - NPY_GENBOOLLTR ='b', - NPY_SIGNEDLTR = 'i', - NPY_UNSIGNEDLTR = 'u', - NPY_FLOATINGLTR = 'f', - NPY_COMPLEXLTR = 'c' -}; - -typedef enum { - NPY_QUICKSORT=0, - NPY_HEAPSORT=1, - NPY_MERGESORT=2 -} NPY_SORTKIND; -#define NPY_NSORTS (NPY_MERGESORT + 1) - - -typedef enum { - NPY_SEARCHLEFT=0, - NPY_SEARCHRIGHT=1 -} NPY_SEARCHSIDE; -#define NPY_NSEARCHSIDES (NPY_SEARCHRIGHT + 1) - - -typedef enum { - NPY_NOSCALAR=-1, - NPY_BOOL_SCALAR, - NPY_INTPOS_SCALAR, - NPY_INTNEG_SCALAR, - NPY_FLOAT_SCALAR, - NPY_COMPLEX_SCALAR, - NPY_OBJECT_SCALAR -} NPY_SCALARKIND; -#define NPY_NSCALARKINDS (NPY_OBJECT_SCALAR + 1) - -/* For specifying array memory layout or iteration order */ -typedef enum { - /* Fortran order if inputs are all Fortran, C otherwise */ - NPY_ANYORDER=-1, - /* C order */ - NPY_CORDER=0, - /* Fortran order */ - NPY_FORTRANORDER=1, - /* An order as close to the inputs as possible */ - NPY_KEEPORDER=2 -} NPY_ORDER; - -/* For specifying allowed casting in operations which support it */ -typedef enum { - /* Only allow identical types */ - NPY_NO_CASTING=0, - /* Allow identical and byte swapped types */ - NPY_EQUIV_CASTING=1, - /* Only allow safe casts */ - NPY_SAFE_CASTING=2, - /* Allow safe casts or casts within the same kind */ - NPY_SAME_KIND_CASTING=3, - /* Allow any casts */ - NPY_UNSAFE_CASTING=4, - - /* - * Temporary internal definition only, will be removed in upcoming - * release, see below - * */ - NPY_INTERNAL_UNSAFE_CASTING_BUT_WARN_UNLESS_SAME_KIND = 100, -} NPY_CASTING; - -typedef enum { - NPY_CLIP=0, - NPY_WRAP=1, - NPY_RAISE=2 -} NPY_CLIPMODE; - -/* The special not-a-time (NaT) value */ -#define NPY_DATETIME_NAT NPY_MIN_INT64 - -/* - * Upper bound on the length of a DATETIME ISO 8601 string - * YEAR: 21 (64-bit year) - * MONTH: 3 - * DAY: 3 - * HOURS: 3 - * MINUTES: 3 - * SECONDS: 3 - * ATTOSECONDS: 1 + 3*6 - * TIMEZONE: 5 - * NULL TERMINATOR: 1 - */ -#define NPY_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) - -typedef enum { - NPY_FR_Y = 0, /* Years */ - NPY_FR_M = 1, /* Months */ - NPY_FR_W = 2, /* Weeks */ - /* Gap where 1.6 NPY_FR_B (value 3) was */ - NPY_FR_D = 4, /* Days */ - NPY_FR_h = 5, /* hours */ - NPY_FR_m = 6, /* minutes */ - NPY_FR_s = 7, /* seconds */ - NPY_FR_ms = 8, /* milliseconds */ - NPY_FR_us = 9, /* microseconds */ - NPY_FR_ns = 10,/* nanoseconds */ - NPY_FR_ps = 11,/* picoseconds */ - NPY_FR_fs = 12,/* femtoseconds */ - NPY_FR_as = 13,/* attoseconds */ - NPY_FR_GENERIC = 14 /* Generic, unbound units, can convert to anything */ -} NPY_DATETIMEUNIT; - -/* - * NOTE: With the NPY_FR_B gap for 1.6 ABI compatibility, NPY_DATETIME_NUMUNITS - * is technically one more than the actual number of units. - */ -#define NPY_DATETIME_NUMUNITS (NPY_FR_GENERIC + 1) -#define NPY_DATETIME_DEFAULTUNIT NPY_FR_GENERIC - -/* - * Business day conventions for mapping invalid business - * days to valid business days. - */ -typedef enum { - /* Go forward in time to the following business day. */ - NPY_BUSDAY_FORWARD, - NPY_BUSDAY_FOLLOWING = NPY_BUSDAY_FORWARD, - /* Go backward in time to the preceding business day. */ - NPY_BUSDAY_BACKWARD, - NPY_BUSDAY_PRECEDING = NPY_BUSDAY_BACKWARD, - /* - * Go forward in time to the following business day, unless it - * crosses a month boundary, in which case go backward - */ - NPY_BUSDAY_MODIFIEDFOLLOWING, - /* - * Go backward in time to the preceding business day, unless it - * crosses a month boundary, in which case go forward. - */ - NPY_BUSDAY_MODIFIEDPRECEDING, - /* Produce a NaT for non-business days. 
*/ - NPY_BUSDAY_NAT, - /* Raise an exception for non-business days. */ - NPY_BUSDAY_RAISE -} NPY_BUSDAY_ROLL; - -/************************************************************ - * NumPy Auxiliary Data for inner loops, sort functions, etc. - ************************************************************/ - -/* - * When creating an auxiliary data struct, this should always appear - * as the first member, like this: - * - * typedef struct { - * NpyAuxData base; - * double constant; - * } constant_multiplier_aux_data; - */ -typedef struct NpyAuxData_tag NpyAuxData; - -/* Function pointers for freeing or cloning auxiliary data */ -typedef void (NpyAuxData_FreeFunc) (NpyAuxData *); -typedef NpyAuxData *(NpyAuxData_CloneFunc) (NpyAuxData *); - -struct NpyAuxData_tag { - NpyAuxData_FreeFunc *free; - NpyAuxData_CloneFunc *clone; - /* To allow for a bit of expansion without breaking the ABI */ - void *reserved[2]; -}; - -/* Macros to use for freeing and cloning auxiliary data */ -#define NPY_AUXDATA_FREE(auxdata) \ - do { \ - if ((auxdata) != NULL) { \ - (auxdata)->free(auxdata); \ - } \ - } while(0) -#define NPY_AUXDATA_CLONE(auxdata) \ - ((auxdata)->clone(auxdata)) - -#define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr); -#define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr); - -#define NPY_STRINGIFY(x) #x -#define NPY_TOSTRING(x) NPY_STRINGIFY(x) - - /* - * Macros to define how array, and dimension/strides data is - * allocated. - */ - - /* Data buffer - PyDataMem_NEW/FREE/RENEW are in multiarraymodule.c */ - -#define NPY_USE_PYMEM 1 - -#if NPY_USE_PYMEM == 1 -#define PyArray_malloc PyMem_Malloc -#define PyArray_free PyMem_Free -#define PyArray_realloc PyMem_Realloc -#else -#define PyArray_malloc malloc -#define PyArray_free free -#define PyArray_realloc realloc -#endif - -/* Dimensions and strides */ -#define PyDimMem_NEW(size) \ - ((npy_intp *)PyArray_malloc(size*sizeof(npy_intp))) - -#define PyDimMem_FREE(ptr) PyArray_free(ptr) - -#define PyDimMem_RENEW(ptr,size) \ - ((npy_intp *)PyArray_realloc(ptr,size*sizeof(npy_intp))) - -/* forward declaration */ -struct _PyArray_Descr; - -/* These must deal with unaligned and swapped data if necessary */ -typedef PyObject * (PyArray_GetItemFunc) (void *, void *); -typedef int (PyArray_SetItemFunc)(PyObject *, void *, void *); - -typedef void (PyArray_CopySwapNFunc)(void *, npy_intp, void *, npy_intp, - npy_intp, int, void *); - -typedef void (PyArray_CopySwapFunc)(void *, void *, int, void *); -typedef npy_bool (PyArray_NonzeroFunc)(void *, void *); - - -/* - * These assume aligned and notswapped data -- a buffer will be used - * before or contiguous data will be obtained - */ - -typedef int (PyArray_CompareFunc)(const void *, const void *, void *); -typedef int (PyArray_ArgFunc)(void*, npy_intp, npy_intp*, void *); - -typedef void (PyArray_DotFunc)(void *, npy_intp, void *, npy_intp, void *, - npy_intp, void *); - -typedef void (PyArray_VectorUnaryFunc)(void *, void *, npy_intp, void *, - void *); - -/* - * XXX the ignore argument should be removed next time the API version - * is bumped. It used to be the separator. 
- */ -typedef int (PyArray_ScanFunc)(FILE *fp, void *dptr, - char *ignore, struct _PyArray_Descr *); -typedef int (PyArray_FromStrFunc)(char *s, void *dptr, char **endptr, - struct _PyArray_Descr *); - -typedef int (PyArray_FillFunc)(void *, npy_intp, void *); - -typedef int (PyArray_SortFunc)(void *, npy_intp, void *); -typedef int (PyArray_ArgSortFunc)(void *, npy_intp *, npy_intp, void *); - -typedef int (PyArray_FillWithScalarFunc)(void *, npy_intp, void *, void *); - -typedef int (PyArray_ScalarKindFunc)(void *); - -typedef void (PyArray_FastClipFunc)(void *in, npy_intp n_in, void *min, - void *max, void *out); -typedef void (PyArray_FastPutmaskFunc)(void *in, void *mask, npy_intp n_in, - void *values, npy_intp nv); -typedef int (PyArray_FastTakeFunc)(void *dest, void *src, npy_intp *indarray, - npy_intp nindarray, npy_intp n_outer, - npy_intp m_middle, npy_intp nelem, - NPY_CLIPMODE clipmode); - -typedef struct { - npy_intp *ptr; - int len; -} PyArray_Dims; - -typedef struct { - /* - * Functions to cast to most other standard types - * Can have some NULL entries. The types - * DATETIME, TIMEDELTA, and HALF go into the castdict - * even though they are built-in. - */ - PyArray_VectorUnaryFunc *cast[NPY_NTYPES_ABI_COMPATIBLE]; - - /* The next four functions *cannot* be NULL */ - - /* - * Functions to get and set items with standard Python types - * -- not array scalars - */ - PyArray_GetItemFunc *getitem; - PyArray_SetItemFunc *setitem; - - /* - * Copy and/or swap data. Memory areas may not overlap - * Use memmove first if they might - */ - PyArray_CopySwapNFunc *copyswapn; - PyArray_CopySwapFunc *copyswap; - - /* - * Function to compare items - * Can be NULL - */ - PyArray_CompareFunc *compare; - - /* - * Function to select largest - * Can be NULL - */ - PyArray_ArgFunc *argmax; - - /* - * Function to compute dot product - * Can be NULL - */ - PyArray_DotFunc *dotfunc; - - /* - * Function to scan an ASCII file and - * place a single value plus possible separator - * Can be NULL - */ - PyArray_ScanFunc *scanfunc; - - /* - * Function to read a single value from a string - * and adjust the pointer; Can be NULL - */ - PyArray_FromStrFunc *fromstr; - - /* - * Function to determine if data is zero or not - * If NULL a default version is - * used at Registration time. - */ - PyArray_NonzeroFunc *nonzero; - - /* - * Used for arange. - * Can be NULL. - */ - PyArray_FillFunc *fill; - - /* - * Function to fill arrays with scalar values - * Can be NULL - */ - PyArray_FillWithScalarFunc *fillwithscalar; - - /* - * Sorting functions - * Can be NULL - */ - PyArray_SortFunc *sort[NPY_NSORTS]; - PyArray_ArgSortFunc *argsort[NPY_NSORTS]; - - /* - * Dictionary of additional casting functions - * PyArray_VectorUnaryFuncs - * which can be populated to support casting - * to other registered types. Can be NULL - */ - PyObject *castdict; - - /* - * Functions useful for generalizing - * the casting rules. - * Can be NULL; - */ - PyArray_ScalarKindFunc *scalarkind; - int **cancastscalarkindto; - int *cancastto; - - PyArray_FastClipFunc *fastclip; - PyArray_FastPutmaskFunc *fastputmask; - PyArray_FastTakeFunc *fasttake; - - /* - * Function to select smallest - * Can be NULL - */ - PyArray_ArgFunc *argmin; - -} PyArray_ArrFuncs; - -/* The item must be reference counted when it is inserted or extracted. 
*/ -#define NPY_ITEM_REFCOUNT 0x01 -/* Same as needing REFCOUNT */ -#define NPY_ITEM_HASOBJECT 0x01 -/* Convert to list for pickling */ -#define NPY_LIST_PICKLE 0x02 -/* The item is a POINTER */ -#define NPY_ITEM_IS_POINTER 0x04 -/* memory needs to be initialized for this data-type */ -#define NPY_NEEDS_INIT 0x08 -/* operations need Python C-API so don't give-up thread. */ -#define NPY_NEEDS_PYAPI 0x10 -/* Use f.getitem when extracting elements of this data-type */ -#define NPY_USE_GETITEM 0x20 -/* Use f.setitem when setting creating 0-d array from this data-type.*/ -#define NPY_USE_SETITEM 0x40 -/* A sticky flag specifically for structured arrays */ -#define NPY_ALIGNED_STRUCT 0x80 - -/* - *These are inherited for global data-type if any data-types in the - * field have them - */ -#define NPY_FROM_FIELDS (NPY_NEEDS_INIT | NPY_LIST_PICKLE | \ - NPY_ITEM_REFCOUNT | NPY_NEEDS_PYAPI) - -#define NPY_OBJECT_DTYPE_FLAGS (NPY_LIST_PICKLE | NPY_USE_GETITEM | \ - NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT | \ - NPY_NEEDS_INIT | NPY_NEEDS_PYAPI) - -#define PyDataType_FLAGCHK(dtype, flag) \ - (((dtype)->flags & (flag)) == (flag)) - -#define PyDataType_REFCHK(dtype) \ - PyDataType_FLAGCHK(dtype, NPY_ITEM_REFCOUNT) - -typedef struct _PyArray_Descr { - PyObject_HEAD - /* - * the type object representing an - * instance of this type -- should not - * be two type_numbers with the same type - * object. - */ - PyTypeObject *typeobj; - /* kind for this type */ - char kind; - /* unique-character representing this type */ - char type; - /* - * '>' (big), '<' (little), '|' - * (not-applicable), or '=' (native). - */ - char byteorder; - /* flags describing data type */ - char flags; - /* number representing this type */ - int type_num; - /* element size (itemsize) for this type */ - int elsize; - /* alignment needed for this type */ - int alignment; - /* - * Non-NULL if this type is - * is an array (C-contiguous) - * of some other type - */ - struct _arr_descr *subarray; - /* - * The fields dictionary for this type - * For statically defined descr this - * is always Py_None - */ - PyObject *fields; - /* - * An ordered tuple of field names or NULL - * if no fields are defined - */ - PyObject *names; - /* - * a table of functions specific for each - * basic data descriptor - */ - PyArray_ArrFuncs *f; - /* Metadata about this dtype */ - PyObject *metadata; - /* - * Metadata specific to the C implementation - * of the particular dtype. This was added - * for NumPy 1.7.0. - */ - NpyAuxData *c_metadata; -} PyArray_Descr; - -typedef struct _arr_descr { - PyArray_Descr *base; - PyObject *shape; /* a tuple */ -} PyArray_ArrayDescr; - -/* - * The main array object structure. - * - * It has been recommended to use the inline functions defined below - * (PyArray_DATA and friends) to access fields here for a number of - * releases. Direct access to the members themselves is deprecated. - * To ensure that your code does not use deprecated access, - * #define NPY_NO_DEPRECATED_API NPY_1_7_VERSION - * (or NPY_1_8_VERSION or higher as required). 
- */ -/* This struct will be moved to a private header in a future release */ -typedef struct tagPyArrayObject_fields { - PyObject_HEAD - /* Pointer to the raw data buffer */ - char *data; - /* The number of dimensions, also called 'ndim' */ - int nd; - /* The size in each dimension, also called 'shape' */ - npy_intp *dimensions; - /* - * Number of bytes to jump to get to the - * next element in each dimension - */ - npy_intp *strides; - /* - * This object is decref'd upon - * deletion of array. Except in the - * case of UPDATEIFCOPY which has - * special handling. - * - * For views it points to the original - * array, collapsed so no chains of - * views occur. - * - * For creation from buffer object it - * points to an object that shold be - * decref'd on deletion - * - * For UPDATEIFCOPY flag this is an - * array to-be-updated upon deletion - * of this one - */ - PyObject *base; - /* Pointer to type structure */ - PyArray_Descr *descr; - /* Flags describing array -- see below */ - int flags; - /* For weak references */ - PyObject *weakreflist; -} PyArrayObject_fields; - -/* - * To hide the implementation details, we only expose - * the Python struct HEAD. - */ -#if !(defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) -/* - * Can't put this in npy_deprecated_api.h like the others. - * PyArrayObject field access is deprecated as of NumPy 1.7. - */ -typedef PyArrayObject_fields PyArrayObject; -#else -typedef struct tagPyArrayObject { - PyObject_HEAD -} PyArrayObject; -#endif - -#define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields)) - -/* Array Flags Object */ -typedef struct PyArrayFlagsObject { - PyObject_HEAD - PyObject *arr; - int flags; -} PyArrayFlagsObject; - -/* Mirrors buffer object to ptr */ - -typedef struct { - PyObject_HEAD - PyObject *base; - void *ptr; - npy_intp len; - int flags; -} PyArray_Chunk; - -typedef struct { - NPY_DATETIMEUNIT base; - int num; -} PyArray_DatetimeMetaData; - -typedef struct { - NpyAuxData base; - PyArray_DatetimeMetaData meta; -} PyArray_DatetimeDTypeMetaData; - -/* - * This structure contains an exploded view of a date-time value. - * NaT is represented by year == NPY_DATETIME_NAT. - */ -typedef struct { - npy_int64 year; - npy_int32 month, day, hour, min, sec, us, ps, as; -} npy_datetimestruct; - -/* This is not used internally. */ -typedef struct { - npy_int64 day; - npy_int32 sec, us, ps, as; -} npy_timedeltastruct; - -typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *); - -/* - * Means c-style contiguous (last index varies the fastest). The data - * elements right after each other. - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_C_CONTIGUOUS 0x0001 - -/* - * Set if array is a contiguous Fortran array: the first index varies - * the fastest in memory (strides array is reverse of C-contiguous - * array) - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_F_CONTIGUOUS 0x0002 - -/* - * Note: all 0-d arrays are C_CONTIGUOUS and F_CONTIGUOUS. If a - * 1-d array is C_CONTIGUOUS it is also F_CONTIGUOUS - */ - -/* - * If set, the array owns the data: it will be free'd when the array - * is deleted. - * - * This flag may be tested for in PyArray_FLAGS(arr). 
- */ -#define NPY_ARRAY_OWNDATA 0x0004 - -/* - * An array never has the next four set; they're only used as parameter - * flags to the the various FromAny functions - * - * This flag may be requested in constructor functions. - */ - -/* Cause a cast to occur regardless of whether or not it is safe. */ -#define NPY_ARRAY_FORCECAST 0x0010 - -/* - * Always copy the array. Returned arrays are always CONTIGUOUS, - * ALIGNED, and WRITEABLE. - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_ENSURECOPY 0x0020 - -/* - * Make sure the returned array is a base-class ndarray - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_ENSUREARRAY 0x0040 - -/* - * Make sure that the strides are in units of the element size Needed - * for some operations with record-arrays. - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_ELEMENTSTRIDES 0x0080 - -/* - * Array data is aligned on the appropiate memory address for the type - * stored according to how the compiler would align things (e.g., an - * array of integers (4 bytes each) starts on a memory address that's - * a multiple of 4) - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_ALIGNED 0x0100 - -/* - * Array data has the native endianness - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_NOTSWAPPED 0x0200 - -/* - * Array data is writeable - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_WRITEABLE 0x0400 - -/* - * If this flag is set, then base contains a pointer to an array of - * the same size that should be updated with the current contents of - * this array when this array is deallocated - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_UPDATEIFCOPY 0x1000 - -/* - * NOTE: there are also internal flags defined in multiarray/arrayobject.h, - * which start at bit 31 and work down. 
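For illustration only (this sketch is not part of the deleted header): the composite flag macros defined just below, such as NPY_ARRAY_BEHAVED, NPY_ARRAY_CARRAY and NPY_ARRAY_IN_ARRAY, are the values callers pass as the requirements argument of PyArray_FromAny, in the same way the PyArray_FromObject and PyArray_ContiguousFromObject convenience macros in ndarrayobject.h do earlier in this patch. The helper name as_c_double_array is hypothetical.

/* Hypothetical helper: coerce any Python object to an aligned,
 * C-contiguous NPY_DOUBLE array suitable for read-only input.
 * Returns a new reference, or NULL with an exception set. */
static PyObject *
as_c_double_array(PyObject *obj)
{
    return PyArray_FromAny(obj, PyArray_DescrFromType(NPY_DOUBLE),
                           0, 0, NPY_ARRAY_IN_ARRAY, NULL);
}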
- */ - -#define NPY_ARRAY_BEHAVED (NPY_ARRAY_ALIGNED | \ - NPY_ARRAY_WRITEABLE) -#define NPY_ARRAY_BEHAVED_NS (NPY_ARRAY_ALIGNED | \ - NPY_ARRAY_WRITEABLE | \ - NPY_ARRAY_NOTSWAPPED) -#define NPY_ARRAY_CARRAY (NPY_ARRAY_C_CONTIGUOUS | \ - NPY_ARRAY_BEHAVED) -#define NPY_ARRAY_CARRAY_RO (NPY_ARRAY_C_CONTIGUOUS | \ - NPY_ARRAY_ALIGNED) -#define NPY_ARRAY_FARRAY (NPY_ARRAY_F_CONTIGUOUS | \ - NPY_ARRAY_BEHAVED) -#define NPY_ARRAY_FARRAY_RO (NPY_ARRAY_F_CONTIGUOUS | \ - NPY_ARRAY_ALIGNED) -#define NPY_ARRAY_DEFAULT (NPY_ARRAY_CARRAY) -#define NPY_ARRAY_IN_ARRAY (NPY_ARRAY_CARRAY_RO) -#define NPY_ARRAY_OUT_ARRAY (NPY_ARRAY_CARRAY) -#define NPY_ARRAY_INOUT_ARRAY (NPY_ARRAY_CARRAY | \ - NPY_ARRAY_UPDATEIFCOPY) -#define NPY_ARRAY_IN_FARRAY (NPY_ARRAY_FARRAY_RO) -#define NPY_ARRAY_OUT_FARRAY (NPY_ARRAY_FARRAY) -#define NPY_ARRAY_INOUT_FARRAY (NPY_ARRAY_FARRAY | \ - NPY_ARRAY_UPDATEIFCOPY) - -#define NPY_ARRAY_UPDATE_ALL (NPY_ARRAY_C_CONTIGUOUS | \ - NPY_ARRAY_F_CONTIGUOUS | \ - NPY_ARRAY_ALIGNED) - -/* This flag is for the array interface, not PyArrayObject */ -#define NPY_ARR_HAS_DESCR 0x0800 - - - - -/* - * Size of internal buffers used for alignment Make BUFSIZE a multiple - * of sizeof(npy_cdouble) -- usually 16 so that ufunc buffers are aligned - */ -#define NPY_MIN_BUFSIZE ((int)sizeof(npy_cdouble)) -#define NPY_MAX_BUFSIZE (((int)sizeof(npy_cdouble))*1000000) -#define NPY_BUFSIZE 8192 -/* buffer stress test size: */ -/*#define NPY_BUFSIZE 17*/ - -#define PyArray_MAX(a,b) (((a)>(b))?(a):(b)) -#define PyArray_MIN(a,b) (((a)<(b))?(a):(b)) -#define PyArray_CLT(p,q) ((((p).real==(q).real) ? ((p).imag < (q).imag) : \ - ((p).real < (q).real))) -#define PyArray_CGT(p,q) ((((p).real==(q).real) ? ((p).imag > (q).imag) : \ - ((p).real > (q).real))) -#define PyArray_CLE(p,q) ((((p).real==(q).real) ? ((p).imag <= (q).imag) : \ - ((p).real <= (q).real))) -#define PyArray_CGE(p,q) ((((p).real==(q).real) ? ((p).imag >= (q).imag) : \ - ((p).real >= (q).real))) -#define PyArray_CEQ(p,q) (((p).real==(q).real) && ((p).imag == (q).imag)) -#define PyArray_CNE(p,q) (((p).real!=(q).real) || ((p).imag != (q).imag)) - -/* - * C API: consists of Macros and functions. The MACROS are defined - * here. 
- */ - - -#define PyArray_ISCONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS) -#define PyArray_ISWRITEABLE(m) PyArray_CHKFLAGS(m, NPY_ARRAY_WRITEABLE) -#define PyArray_ISALIGNED(m) PyArray_CHKFLAGS(m, NPY_ARRAY_ALIGNED) - -#define PyArray_IS_C_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS) -#define PyArray_IS_F_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) - -#if NPY_ALLOW_THREADS -#define NPY_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS -#define NPY_END_ALLOW_THREADS Py_END_ALLOW_THREADS -#define NPY_BEGIN_THREADS_DEF PyThreadState *_save=NULL; -#define NPY_BEGIN_THREADS do {_save = PyEval_SaveThread();} while (0); -#define NPY_END_THREADS do {if (_save) PyEval_RestoreThread(_save);} while (0); - -#define NPY_BEGIN_THREADS_DESCR(dtype) \ - do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \ - NPY_BEGIN_THREADS;} while (0); - -#define NPY_END_THREADS_DESCR(dtype) \ - do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \ - NPY_END_THREADS; } while (0); - -#define NPY_ALLOW_C_API_DEF PyGILState_STATE __save__; -#define NPY_ALLOW_C_API do {__save__ = PyGILState_Ensure();} while (0); -#define NPY_DISABLE_C_API do {PyGILState_Release(__save__);} while (0); -#else -#define NPY_BEGIN_ALLOW_THREADS -#define NPY_END_ALLOW_THREADS -#define NPY_BEGIN_THREADS_DEF -#define NPY_BEGIN_THREADS -#define NPY_END_THREADS -#define NPY_BEGIN_THREADS_DESCR(dtype) -#define NPY_END_THREADS_DESCR(dtype) -#define NPY_ALLOW_C_API_DEF -#define NPY_ALLOW_C_API -#define NPY_DISABLE_C_API -#endif - -/********************************** - * The nditer object, added in 1.6 - **********************************/ - -/* The actual structure of the iterator is an internal detail */ -typedef struct NpyIter_InternalOnly NpyIter; - -/* Iterator function pointers that may be specialized */ -typedef int (NpyIter_IterNextFunc)(NpyIter *iter); -typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter, - npy_intp *outcoords); - -/*** Global flags that may be passed to the iterator constructors ***/ - -/* Track an index representing C order */ -#define NPY_ITER_C_INDEX 0x00000001 -/* Track an index representing Fortran order */ -#define NPY_ITER_F_INDEX 0x00000002 -/* Track a multi-index */ -#define NPY_ITER_MULTI_INDEX 0x00000004 -/* User code external to the iterator does the 1-dimensional innermost loop */ -#define NPY_ITER_EXTERNAL_LOOP 0x00000008 -/* Convert all the operands to a common data type */ -#define NPY_ITER_COMMON_DTYPE 0x00000010 -/* Operands may hold references, requiring API access during iteration */ -#define NPY_ITER_REFS_OK 0x00000020 -/* Zero-sized operands should be permitted, iteration checks IterSize for 0 */ -#define NPY_ITER_ZEROSIZE_OK 0x00000040 -/* Permits reductions (size-0 stride with dimension size > 1) */ -#define NPY_ITER_REDUCE_OK 0x00000080 -/* Enables sub-range iteration */ -#define NPY_ITER_RANGED 0x00000100 -/* Enables buffering */ -#define NPY_ITER_BUFFERED 0x00000200 -/* When buffering is enabled, grows the inner loop if possible */ -#define NPY_ITER_GROWINNER 0x00000400 -/* Delay allocation of buffers until first Reset* call */ -#define NPY_ITER_DELAY_BUFALLOC 0x00000800 -/* When NPY_KEEPORDER is specified, disable reversing negative-stride axes */ -#define NPY_ITER_DONT_NEGATE_STRIDES 0x00001000 - -/*** Per-operand flags that may be passed to the iterator constructors ***/ - -/* The operand will be read from and written to */ -#define NPY_ITER_READWRITE 0x00010000 -/* The operand will only be read from */ -#define NPY_ITER_READONLY 0x00020000 -/* The operand 
will only be written to */ -#define NPY_ITER_WRITEONLY 0x00040000 -/* The operand's data must be in native byte order */ -#define NPY_ITER_NBO 0x00080000 -/* The operand's data must be aligned */ -#define NPY_ITER_ALIGNED 0x00100000 -/* The operand's data must be contiguous (within the inner loop) */ -#define NPY_ITER_CONTIG 0x00200000 -/* The operand may be copied to satisfy requirements */ -#define NPY_ITER_COPY 0x00400000 -/* The operand may be copied with UPDATEIFCOPY to satisfy requirements */ -#define NPY_ITER_UPDATEIFCOPY 0x00800000 -/* Allocate the operand if it is NULL */ -#define NPY_ITER_ALLOCATE 0x01000000 -/* If an operand is allocated, don't use any subtype */ -#define NPY_ITER_NO_SUBTYPE 0x02000000 -/* This is a virtual array slot, operand is NULL but temporary data is there */ -#define NPY_ITER_VIRTUAL 0x04000000 -/* Require that the dimension match the iterator dimensions exactly */ -#define NPY_ITER_NO_BROADCAST 0x08000000 -/* A mask is being used on this array, affects buffer -> array copy */ -#define NPY_ITER_WRITEMASKED 0x10000000 -/* This array is the mask for all WRITEMASKED operands */ -#define NPY_ITER_ARRAYMASK 0x20000000 - -#define NPY_ITER_GLOBAL_FLAGS 0x0000ffff -#define NPY_ITER_PER_OP_FLAGS 0xffff0000 - - -/***************************** - * Basic iterator object - *****************************/ - -/* FWD declaration */ -typedef struct PyArrayIterObject_tag PyArrayIterObject; - -/* - * type of the function which translates a set of coordinates to a - * pointer to the data - */ -typedef char* (*npy_iter_get_dataptr_t)(PyArrayIterObject* iter, npy_intp*); - -struct PyArrayIterObject_tag { - PyObject_HEAD - int nd_m1; /* number of dimensions - 1 */ - npy_intp index, size; - npy_intp coordinates[NPY_MAXDIMS];/* N-dimensional loop */ - npy_intp dims_m1[NPY_MAXDIMS]; /* ao->dimensions - 1 */ - npy_intp strides[NPY_MAXDIMS]; /* ao->strides or fake */ - npy_intp backstrides[NPY_MAXDIMS];/* how far to jump back */ - npy_intp factors[NPY_MAXDIMS]; /* shape factors */ - PyArrayObject *ao; - char *dataptr; /* pointer to current item*/ - npy_bool contiguous; - - npy_intp bounds[NPY_MAXDIMS][2]; - npy_intp limits[NPY_MAXDIMS][2]; - npy_intp limits_sizes[NPY_MAXDIMS]; - npy_iter_get_dataptr_t translate; -} ; - - -/* Iterator API */ -#define PyArrayIter_Check(op) PyObject_TypeCheck(op, &PyArrayIter_Type) - -#define _PyAIT(it) ((PyArrayIterObject *)(it)) -#define PyArray_ITER_RESET(it) do { \ - _PyAIT(it)->index = 0; \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao); \ - memset(_PyAIT(it)->coordinates, 0, \ - (_PyAIT(it)->nd_m1+1)*sizeof(npy_intp)); \ -} while (0) - -#define _PyArray_ITER_NEXT1(it) do { \ - (it)->dataptr += _PyAIT(it)->strides[0]; \ - (it)->coordinates[0]++; \ -} while (0) - -#define _PyArray_ITER_NEXT2(it) do { \ - if ((it)->coordinates[1] < (it)->dims_m1[1]) { \ - (it)->coordinates[1]++; \ - (it)->dataptr += (it)->strides[1]; \ - } \ - else { \ - (it)->coordinates[1] = 0; \ - (it)->coordinates[0]++; \ - (it)->dataptr += (it)->strides[0] - \ - (it)->backstrides[1]; \ - } \ -} while (0) - -#define _PyArray_ITER_NEXT3(it) do { \ - if ((it)->coordinates[2] < (it)->dims_m1[2]) { \ - (it)->coordinates[2]++; \ - (it)->dataptr += (it)->strides[2]; \ - } \ - else { \ - (it)->coordinates[2] = 0; \ - (it)->dataptr -= (it)->backstrides[2]; \ - if ((it)->coordinates[1] < (it)->dims_m1[1]) { \ - (it)->coordinates[1]++; \ - (it)->dataptr += (it)->strides[1]; \ - } \ - else { \ - (it)->coordinates[1] = 0; \ - (it)->coordinates[0]++; \ - (it)->dataptr += 
(it)->strides[0] \ - (it)->backstrides[1]; \ - } \ - } \ -} while (0) - -#define PyArray_ITER_NEXT(it) do { \ - _PyAIT(it)->index++; \ - if (_PyAIT(it)->nd_m1 == 0) { \ - _PyArray_ITER_NEXT1(_PyAIT(it)); \ - } \ - else if (_PyAIT(it)->contiguous) \ - _PyAIT(it)->dataptr += PyArray_DESCR(_PyAIT(it)->ao)->elsize; \ - else if (_PyAIT(it)->nd_m1 == 1) { \ - _PyArray_ITER_NEXT2(_PyAIT(it)); \ - } \ - else { \ - int __npy_i; \ - for (__npy_i=_PyAIT(it)->nd_m1; __npy_i >= 0; __npy_i--) { \ - if (_PyAIT(it)->coordinates[__npy_i] < \ - _PyAIT(it)->dims_m1[__npy_i]) { \ - _PyAIT(it)->coordinates[__npy_i]++; \ - _PyAIT(it)->dataptr += \ - _PyAIT(it)->strides[__npy_i]; \ - break; \ - } \ - else { \ - _PyAIT(it)->coordinates[__npy_i] = 0; \ - _PyAIT(it)->dataptr -= \ - _PyAIT(it)->backstrides[__npy_i]; \ - } \ - } \ - } \ -} while (0) - -#define PyArray_ITER_GOTO(it, destination) do { \ - int __npy_i; \ - _PyAIT(it)->index = 0; \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao); \ - for (__npy_i = _PyAIT(it)->nd_m1; __npy_i>=0; __npy_i--) { \ - if (destination[__npy_i] < 0) { \ - destination[__npy_i] += \ - _PyAIT(it)->dims_m1[__npy_i]+1; \ - } \ - _PyAIT(it)->dataptr += destination[__npy_i] * \ - _PyAIT(it)->strides[__npy_i]; \ - _PyAIT(it)->coordinates[__npy_i] = \ - destination[__npy_i]; \ - _PyAIT(it)->index += destination[__npy_i] * \ - ( __npy_i==_PyAIT(it)->nd_m1 ? 1 : \ - _PyAIT(it)->dims_m1[__npy_i+1]+1) ; \ - } \ -} while (0) - -#define PyArray_ITER_GOTO1D(it, ind) do { \ - int __npy_i; \ - npy_intp __npy_ind = (npy_intp) (ind); \ - if (__npy_ind < 0) __npy_ind += _PyAIT(it)->size; \ - _PyAIT(it)->index = __npy_ind; \ - if (_PyAIT(it)->nd_m1 == 0) { \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao) + \ - __npy_ind * _PyAIT(it)->strides[0]; \ - } \ - else if (_PyAIT(it)->contiguous) \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao) + \ - __npy_ind * PyArray_DESCR(_PyAIT(it)->ao)->elsize; \ - else { \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao); \ - for (__npy_i = 0; __npy_i<=_PyAIT(it)->nd_m1; \ - __npy_i++) { \ - _PyAIT(it)->dataptr += \ - (__npy_ind / _PyAIT(it)->factors[__npy_i]) \ - * _PyAIT(it)->strides[__npy_i]; \ - __npy_ind %= _PyAIT(it)->factors[__npy_i]; \ - } \ - } \ -} while (0) - -#define PyArray_ITER_DATA(it) ((void *)(_PyAIT(it)->dataptr)) - -#define PyArray_ITER_NOTDONE(it) (_PyAIT(it)->index < _PyAIT(it)->size) - - -/* - * Any object passed to PyArray_Broadcast must be binary compatible - * with this structure. 
- */ - -typedef struct { - PyObject_HEAD - int numiter; /* number of iters */ - npy_intp size; /* broadcasted size */ - npy_intp index; /* current index */ - int nd; /* number of dims */ - npy_intp dimensions[NPY_MAXDIMS]; /* dimensions */ - PyArrayIterObject *iters[NPY_MAXARGS]; /* iterators */ -} PyArrayMultiIterObject; - -#define _PyMIT(m) ((PyArrayMultiIterObject *)(m)) -#define PyArray_MultiIter_RESET(multi) do { \ - int __npy_mi; \ - _PyMIT(multi)->index = 0; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_RESET(_PyMIT(multi)->iters[__npy_mi]); \ - } \ -} while (0) - -#define PyArray_MultiIter_NEXT(multi) do { \ - int __npy_mi; \ - _PyMIT(multi)->index++; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_NEXT(_PyMIT(multi)->iters[__npy_mi]); \ - } \ -} while (0) - -#define PyArray_MultiIter_GOTO(multi, dest) do { \ - int __npy_mi; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_GOTO(_PyMIT(multi)->iters[__npy_mi], dest); \ - } \ - _PyMIT(multi)->index = _PyMIT(multi)->iters[0]->index; \ -} while (0) - -#define PyArray_MultiIter_GOTO1D(multi, ind) do { \ - int __npy_mi; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_GOTO1D(_PyMIT(multi)->iters[__npy_mi], ind); \ - } \ - _PyMIT(multi)->index = _PyMIT(multi)->iters[0]->index; \ -} while (0) - -#define PyArray_MultiIter_DATA(multi, i) \ - ((void *)(_PyMIT(multi)->iters[i]->dataptr)) - -#define PyArray_MultiIter_NEXTi(multi, i) \ - PyArray_ITER_NEXT(_PyMIT(multi)->iters[i]) - -#define PyArray_MultiIter_NOTDONE(multi) \ - (_PyMIT(multi)->index < _PyMIT(multi)->size) - -/* Store the information needed for fancy-indexing over an array */ - -typedef struct { - PyObject_HEAD - /* - * Multi-iterator portion --- needs to be present in this - * order to work with PyArray_Broadcast - */ - - int numiter; /* number of index-array - iterators */ - npy_intp size; /* size of broadcasted - result */ - npy_intp index; /* current index */ - int nd; /* number of dims */ - npy_intp dimensions[NPY_MAXDIMS]; /* dimensions */ - PyArrayIterObject *iters[NPY_MAXDIMS]; /* index object - iterators */ - PyArrayIterObject *ait; /* flat Iterator for - underlying array */ - - /* flat iterator for subspace (when numiter < nd) */ - PyArrayIterObject *subspace; - - /* - * if subspace iteration, then this is the array of axes in - * the underlying array represented by the index objects - */ - int iteraxes[NPY_MAXDIMS]; - /* - * if subspace iteration, the these are the coordinates to the - * start of the subspace. 
- */ - npy_intp bscoord[NPY_MAXDIMS]; - - PyObject *indexobj; /* creating obj */ - int consec; - char *dataptr; - -} PyArrayMapIterObject; - -enum { - NPY_NEIGHBORHOOD_ITER_ZERO_PADDING, - NPY_NEIGHBORHOOD_ITER_ONE_PADDING, - NPY_NEIGHBORHOOD_ITER_CONSTANT_PADDING, - NPY_NEIGHBORHOOD_ITER_CIRCULAR_PADDING, - NPY_NEIGHBORHOOD_ITER_MIRROR_PADDING -}; - -typedef struct { - PyObject_HEAD - - /* - * PyArrayIterObject part: keep this in this exact order - */ - int nd_m1; /* number of dimensions - 1 */ - npy_intp index, size; - npy_intp coordinates[NPY_MAXDIMS];/* N-dimensional loop */ - npy_intp dims_m1[NPY_MAXDIMS]; /* ao->dimensions - 1 */ - npy_intp strides[NPY_MAXDIMS]; /* ao->strides or fake */ - npy_intp backstrides[NPY_MAXDIMS];/* how far to jump back */ - npy_intp factors[NPY_MAXDIMS]; /* shape factors */ - PyArrayObject *ao; - char *dataptr; /* pointer to current item*/ - npy_bool contiguous; - - npy_intp bounds[NPY_MAXDIMS][2]; - npy_intp limits[NPY_MAXDIMS][2]; - npy_intp limits_sizes[NPY_MAXDIMS]; - npy_iter_get_dataptr_t translate; - - /* - * New members - */ - npy_intp nd; - - /* Dimensions is the dimension of the array */ - npy_intp dimensions[NPY_MAXDIMS]; - - /* - * Neighborhood points coordinates are computed relatively to the - * point pointed by _internal_iter - */ - PyArrayIterObject* _internal_iter; - /* - * To keep a reference to the representation of the constant value - * for constant padding - */ - char* constant; - - int mode; -} PyArrayNeighborhoodIterObject; - -/* - * Neighborhood iterator API - */ - -/* General: those work for any mode */ -static NPY_INLINE int -PyArrayNeighborhoodIter_Reset(PyArrayNeighborhoodIterObject* iter); -static NPY_INLINE int -PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter); -#if 0 -static NPY_INLINE int -PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter); -#endif - -/* - * Include inline implementations - functions defined there are not - * considered public API - */ -#define _NPY_INCLUDE_NEIGHBORHOOD_IMP -#include "_neighborhood_iterator_imp.h" -#undef _NPY_INCLUDE_NEIGHBORHOOD_IMP - -/* The default array type */ -#define NPY_DEFAULT_TYPE NPY_DOUBLE - -/* - * All sorts of useful ways to look into a PyArrayObject. It is recommended - * to use PyArrayObject * objects instead of always casting from PyObject *, - * for improved type checking. - * - * In many cases here the macro versions of the accessors are deprecated, - * but can't be immediately changed to inline functions because the - * preexisting macros accept PyObject * and do automatic casts. Inline - * functions accepting PyArrayObject * provides for some compile-time - * checking of correctness when working with these objects in C. - */ - -#define PyArray_ISONESEGMENT(m) (PyArray_NDIM(m) == 0 || \ - PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS) || \ - PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS)) - -#define PyArray_ISFORTRAN(m) (PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) && \ - (PyArray_NDIM(m) > 1)) - -#define PyArray_FORTRAN_IF(m) ((PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) ? \ - NPY_ARRAY_F_CONTIGUOUS : 0)) - -#if (defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) -/* - * Changing access macros into functions, to allow for future hiding - * of the internal memory layout. This later hiding will allow the 2.x series - * to change the internal representation of arrays without affecting - * ABI compatibility. 
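For illustration only (not part of the deleted header): the inline accessors defined just below are the supported way to inspect an array, and code that uses them instead of reading PyArrayObject_fields directly keeps working if the struct layout changes. A minimal sketch under that assumption; the function name sum_ubyte_1d is hypothetical.

/* Hypothetical helper: sum a 1-d, C-contiguous NPY_UBYTE array using
 * only the accessor functions, never touching the struct fields.
 * Returns 0 on success, -1 if the array does not match. */
static int
sum_ubyte_1d(PyArrayObject *arr, npy_intp *out)
{
    npy_intp i, n, total = 0;
    const npy_ubyte *data;

    if (PyArray_NDIM(arr) != 1 || PyArray_TYPE(arr) != NPY_UBYTE ||
        !PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS)) {
        return -1;
    }
    n = PyArray_DIM(arr, 0);
    data = (const npy_ubyte *)PyArray_DATA(arr);
    for (i = 0; i < n; i++) {
        total += data[i];
    }
    *out = total;
    return 0;
}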
- */ - -static NPY_INLINE int -PyArray_NDIM(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->nd; -} - -static NPY_INLINE void * -PyArray_DATA(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->data; -} - -static NPY_INLINE char * -PyArray_BYTES(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->data; -} - -static NPY_INLINE npy_intp * -PyArray_DIMS(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->dimensions; -} - -static NPY_INLINE npy_intp * -PyArray_STRIDES(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->strides; -} - -static NPY_INLINE npy_intp -PyArray_DIM(const PyArrayObject *arr, int idim) -{ - return ((PyArrayObject_fields *)arr)->dimensions[idim]; -} - -static NPY_INLINE npy_intp -PyArray_STRIDE(const PyArrayObject *arr, int istride) -{ - return ((PyArrayObject_fields *)arr)->strides[istride]; -} - -static NPY_INLINE PyObject * -PyArray_BASE(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->base; -} - -static NPY_INLINE PyArray_Descr * -PyArray_DESCR(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr; -} - -static NPY_INLINE int -PyArray_FLAGS(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->flags; -} - -static NPY_INLINE npy_intp -PyArray_ITEMSIZE(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr->elsize; -} - -static NPY_INLINE int -PyArray_TYPE(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr->type_num; -} - -static NPY_INLINE int -PyArray_CHKFLAGS(const PyArrayObject *arr, int flags) -{ - return (PyArray_FLAGS(arr) & flags) == flags; -} - -static NPY_INLINE PyObject * -PyArray_GETITEM(const PyArrayObject *arr, const char *itemptr) -{ - return ((PyArrayObject_fields *)arr)->descr->f->getitem( - (void *)itemptr, (PyArrayObject *)arr); -} - -static NPY_INLINE int -PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v) -{ - return ((PyArrayObject_fields *)arr)->descr->f->setitem( - v, itemptr, arr); -} - -#else - -/* These macros are deprecated as of NumPy 1.7. 
*/ -#define PyArray_NDIM(obj) (((PyArrayObject_fields *)(obj))->nd) -#define PyArray_BYTES(obj) (((PyArrayObject_fields *)(obj))->data) -#define PyArray_DATA(obj) ((void *)((PyArrayObject_fields *)(obj))->data) -#define PyArray_DIMS(obj) (((PyArrayObject_fields *)(obj))->dimensions) -#define PyArray_STRIDES(obj) (((PyArrayObject_fields *)(obj))->strides) -#define PyArray_DIM(obj,n) (PyArray_DIMS(obj)[n]) -#define PyArray_STRIDE(obj,n) (PyArray_STRIDES(obj)[n]) -#define PyArray_BASE(obj) (((PyArrayObject_fields *)(obj))->base) -#define PyArray_DESCR(obj) (((PyArrayObject_fields *)(obj))->descr) -#define PyArray_FLAGS(obj) (((PyArrayObject_fields *)(obj))->flags) -#define PyArray_CHKFLAGS(m, FLAGS) \ - ((((PyArrayObject_fields *)(m))->flags & (FLAGS)) == (FLAGS)) -#define PyArray_ITEMSIZE(obj) \ - (((PyArrayObject_fields *)(obj))->descr->elsize) -#define PyArray_TYPE(obj) \ - (((PyArrayObject_fields *)(obj))->descr->type_num) -#define PyArray_GETITEM(obj,itemptr) \ - PyArray_DESCR(obj)->f->getitem((char *)(itemptr), \ - (PyArrayObject *)(obj)) - -#define PyArray_SETITEM(obj,itemptr,v) \ - PyArray_DESCR(obj)->f->setitem((PyObject *)(v), \ - (char *)(itemptr), \ - (PyArrayObject *)(obj)) -#endif - -static NPY_INLINE PyArray_Descr * -PyArray_DTYPE(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr; -} - -static NPY_INLINE npy_intp * -PyArray_SHAPE(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->dimensions; -} - -/* - * Enables the specified array flags. Does no checking, - * assumes you know what you're doing. - */ -static NPY_INLINE void -PyArray_ENABLEFLAGS(PyArrayObject *arr, int flags) -{ - ((PyArrayObject_fields *)arr)->flags |= flags; -} - -/* - * Clears the specified array flags. Does no checking, - * assumes you know what you're doing. 
- */ -static NPY_INLINE void -PyArray_CLEARFLAGS(PyArrayObject *arr, int flags) -{ - ((PyArrayObject_fields *)arr)->flags &= ~flags; -} - -#define PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL) - -#define PyTypeNum_ISUNSIGNED(type) (((type) == NPY_UBYTE) || \ - ((type) == NPY_USHORT) || \ - ((type) == NPY_UINT) || \ - ((type) == NPY_ULONG) || \ - ((type) == NPY_ULONGLONG)) - -#define PyTypeNum_ISSIGNED(type) (((type) == NPY_BYTE) || \ - ((type) == NPY_SHORT) || \ - ((type) == NPY_INT) || \ - ((type) == NPY_LONG) || \ - ((type) == NPY_LONGLONG)) - -#define PyTypeNum_ISINTEGER(type) (((type) >= NPY_BYTE) && \ - ((type) <= NPY_ULONGLONG)) - -#define PyTypeNum_ISFLOAT(type) ((((type) >= NPY_FLOAT) && \ - ((type) <= NPY_LONGDOUBLE)) || \ - ((type) == NPY_HALF)) - -#define PyTypeNum_ISNUMBER(type) (((type) <= NPY_CLONGDOUBLE) || \ - ((type) == NPY_HALF)) - -#define PyTypeNum_ISSTRING(type) (((type) == NPY_STRING) || \ - ((type) == NPY_UNICODE)) - -#define PyTypeNum_ISCOMPLEX(type) (((type) >= NPY_CFLOAT) && \ - ((type) <= NPY_CLONGDOUBLE)) - -#define PyTypeNum_ISPYTHON(type) (((type) == NPY_LONG) || \ - ((type) == NPY_DOUBLE) || \ - ((type) == NPY_CDOUBLE) || \ - ((type) == NPY_BOOL) || \ - ((type) == NPY_OBJECT )) - -#define PyTypeNum_ISFLEXIBLE(type) (((type) >=NPY_STRING) && \ - ((type) <=NPY_VOID)) - -#define PyTypeNum_ISDATETIME(type) (((type) >=NPY_DATETIME) && \ - ((type) <=NPY_TIMEDELTA)) - -#define PyTypeNum_ISUSERDEF(type) (((type) >= NPY_USERDEF) && \ - ((type) < NPY_USERDEF+ \ - NPY_NUMUSERTYPES)) - -#define PyTypeNum_ISEXTENDED(type) (PyTypeNum_ISFLEXIBLE(type) || \ - PyTypeNum_ISUSERDEF(type)) - -#define PyTypeNum_ISOBJECT(type) ((type) == NPY_OBJECT) - - -#define PyDataType_ISBOOL(obj) PyTypeNum_ISBOOL(_PyADt(obj)) -#define PyDataType_ISUNSIGNED(obj) PyTypeNum_ISUNSIGNED(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISSIGNED(obj) PyTypeNum_ISSIGNED(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISINTEGER(obj) PyTypeNum_ISINTEGER(((PyArray_Descr*)(obj))->type_num ) -#define PyDataType_ISFLOAT(obj) PyTypeNum_ISFLOAT(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISNUMBER(obj) PyTypeNum_ISNUMBER(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISSTRING(obj) PyTypeNum_ISSTRING(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISCOMPLEX(obj) PyTypeNum_ISCOMPLEX(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISPYTHON(obj) PyTypeNum_ISPYTHON(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISFLEXIBLE(obj) PyTypeNum_ISFLEXIBLE(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISDATETIME(obj) PyTypeNum_ISDATETIME(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISUSERDEF(obj) PyTypeNum_ISUSERDEF(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISEXTENDED(obj) PyTypeNum_ISEXTENDED(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISOBJECT(obj) PyTypeNum_ISOBJECT(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_HASFIELDS(obj) (((PyArray_Descr *)(obj))->names != NULL) -#define PyDataType_HASSUBARRAY(dtype) ((dtype)->subarray != NULL) - -#define PyArray_ISBOOL(obj) PyTypeNum_ISBOOL(PyArray_TYPE(obj)) -#define PyArray_ISUNSIGNED(obj) PyTypeNum_ISUNSIGNED(PyArray_TYPE(obj)) -#define PyArray_ISSIGNED(obj) PyTypeNum_ISSIGNED(PyArray_TYPE(obj)) -#define PyArray_ISINTEGER(obj) PyTypeNum_ISINTEGER(PyArray_TYPE(obj)) -#define PyArray_ISFLOAT(obj) PyTypeNum_ISFLOAT(PyArray_TYPE(obj)) -#define PyArray_ISNUMBER(obj) PyTypeNum_ISNUMBER(PyArray_TYPE(obj)) -#define PyArray_ISSTRING(obj) PyTypeNum_ISSTRING(PyArray_TYPE(obj)) -#define 
PyArray_ISCOMPLEX(obj) PyTypeNum_ISCOMPLEX(PyArray_TYPE(obj)) -#define PyArray_ISPYTHON(obj) PyTypeNum_ISPYTHON(PyArray_TYPE(obj)) -#define PyArray_ISFLEXIBLE(obj) PyTypeNum_ISFLEXIBLE(PyArray_TYPE(obj)) -#define PyArray_ISDATETIME(obj) PyTypeNum_ISDATETIME(PyArray_TYPE(obj)) -#define PyArray_ISUSERDEF(obj) PyTypeNum_ISUSERDEF(PyArray_TYPE(obj)) -#define PyArray_ISEXTENDED(obj) PyTypeNum_ISEXTENDED(PyArray_TYPE(obj)) -#define PyArray_ISOBJECT(obj) PyTypeNum_ISOBJECT(PyArray_TYPE(obj)) -#define PyArray_HASFIELDS(obj) PyDataType_HASFIELDS(PyArray_DESCR(obj)) - - /* - * FIXME: This should check for a flag on the data-type that - * states whether or not it is variable length. Because the - * ISFLEXIBLE check is hard-coded to the built-in data-types. - */ -#define PyArray_ISVARIABLE(obj) PyTypeNum_ISFLEXIBLE(PyArray_TYPE(obj)) - -#define PyArray_SAFEALIGNEDCOPY(obj) (PyArray_ISALIGNED(obj) && !PyArray_ISVARIABLE(obj)) - - -#define NPY_LITTLE '<' -#define NPY_BIG '>' -#define NPY_NATIVE '=' -#define NPY_SWAP 's' -#define NPY_IGNORE '|' - -#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN -#define NPY_NATBYTE NPY_BIG -#define NPY_OPPBYTE NPY_LITTLE -#else -#define NPY_NATBYTE NPY_LITTLE -#define NPY_OPPBYTE NPY_BIG -#endif - -#define PyArray_ISNBO(arg) ((arg) != NPY_OPPBYTE) -#define PyArray_IsNativeByteOrder PyArray_ISNBO -#define PyArray_ISNOTSWAPPED(m) PyArray_ISNBO(PyArray_DESCR(m)->byteorder) -#define PyArray_ISBYTESWAPPED(m) (!PyArray_ISNOTSWAPPED(m)) - -#define PyArray_FLAGSWAP(m, flags) (PyArray_CHKFLAGS(m, flags) && \ - PyArray_ISNOTSWAPPED(m)) - -#define PyArray_ISCARRAY(m) PyArray_FLAGSWAP(m, NPY_ARRAY_CARRAY) -#define PyArray_ISCARRAY_RO(m) PyArray_FLAGSWAP(m, NPY_ARRAY_CARRAY_RO) -#define PyArray_ISFARRAY(m) PyArray_FLAGSWAP(m, NPY_ARRAY_FARRAY) -#define PyArray_ISFARRAY_RO(m) PyArray_FLAGSWAP(m, NPY_ARRAY_FARRAY_RO) -#define PyArray_ISBEHAVED(m) PyArray_FLAGSWAP(m, NPY_ARRAY_BEHAVED) -#define PyArray_ISBEHAVED_RO(m) PyArray_FLAGSWAP(m, NPY_ARRAY_ALIGNED) - - -#define PyDataType_ISNOTSWAPPED(d) PyArray_ISNBO(((PyArray_Descr *)(d))->byteorder) -#define PyDataType_ISBYTESWAPPED(d) (!PyDataType_ISNOTSWAPPED(d)) - -/************************************************************ - * A struct used by PyArray_CreateSortedStridePerm, new in 1.7. - ************************************************************/ - -typedef struct { - npy_intp perm, stride; -} npy_stride_sort_item; - -/************************************************************ - * This is the form of the struct that's returned pointed by the - * PyCObject attribute of an array __array_struct__. See - * http://docs.scipy.org/doc/numpy/reference/arrays.interface.html for the full - * documentation. - ************************************************************/ -typedef struct { - int two; /* - * contains the integer 2 as a sanity - * check - */ - - int nd; /* number of dimensions */ - - char typekind; /* - * kind in array --- character code of - * typestr - */ - - int itemsize; /* size of each element */ - - int flags; /* - * how should be data interpreted. Valid - * flags are CONTIGUOUS (1), F_CONTIGUOUS (2), - * ALIGNED (0x100), NOTSWAPPED (0x200), and - * WRITEABLE (0x400). 
ARR_HAS_DESCR (0x800) - * states that arrdescr field is present in - * structure - */ - - npy_intp *shape; /* - * A length-nd array of shape - * information - */ - - npy_intp *strides; /* A length-nd array of stride information */ - - void *data; /* A pointer to the first element of the array */ - - PyObject *descr; /* - * A list of fields or NULL (ignored if flags - * does not have ARR_HAS_DESCR flag set) - */ -} PyArrayInterface; - -/* - * This is a function for hooking into the PyDataMem_NEW/FREE/RENEW functions. - * See the documentation for PyDataMem_SetEventHook. - */ -typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, - void *user_data); - -#if !(defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) -#include "npy_deprecated_api.h" -#endif - -#endif /* NPY_ARRAYTYPES_H */ diff --git a/include/numpy/noprefix.h b/include/numpy/noprefix.h deleted file mode 100644 index b3e57480e..000000000 --- a/include/numpy/noprefix.h +++ /dev/null @@ -1,209 +0,0 @@ -#ifndef NPY_NOPREFIX_H -#define NPY_NOPREFIX_H - -/* - * You can directly include noprefix.h as a backward - * compatibility measure - */ -#ifndef NPY_NO_PREFIX -#include "ndarrayobject.h" -#include "npy_interrupt.h" -#endif - -#define SIGSETJMP NPY_SIGSETJMP -#define SIGLONGJMP NPY_SIGLONGJMP -#define SIGJMP_BUF NPY_SIGJMP_BUF - -#define MAX_DIMS NPY_MAXDIMS - -#define longlong npy_longlong -#define ulonglong npy_ulonglong -#define Bool npy_bool -#define longdouble npy_longdouble -#define byte npy_byte - -#ifndef _BSD_SOURCE -#define ushort npy_ushort -#define uint npy_uint -#define ulong npy_ulong -#endif - -#define ubyte npy_ubyte -#define ushort npy_ushort -#define uint npy_uint -#define ulong npy_ulong -#define cfloat npy_cfloat -#define cdouble npy_cdouble -#define clongdouble npy_clongdouble -#define Int8 npy_int8 -#define UInt8 npy_uint8 -#define Int16 npy_int16 -#define UInt16 npy_uint16 -#define Int32 npy_int32 -#define UInt32 npy_uint32 -#define Int64 npy_int64 -#define UInt64 npy_uint64 -#define Int128 npy_int128 -#define UInt128 npy_uint128 -#define Int256 npy_int256 -#define UInt256 npy_uint256 -#define Float16 npy_float16 -#define Complex32 npy_complex32 -#define Float32 npy_float32 -#define Complex64 npy_complex64 -#define Float64 npy_float64 -#define Complex128 npy_complex128 -#define Float80 npy_float80 -#define Complex160 npy_complex160 -#define Float96 npy_float96 -#define Complex192 npy_complex192 -#define Float128 npy_float128 -#define Complex256 npy_complex256 -#define intp npy_intp -#define uintp npy_uintp -#define datetime npy_datetime -#define timedelta npy_timedelta - -#define SIZEOF_INTP NPY_SIZEOF_INTP -#define SIZEOF_UINTP NPY_SIZEOF_UINTP -#define SIZEOF_DATETIME NPY_SIZEOF_DATETIME -#define SIZEOF_TIMEDELTA NPY_SIZEOF_TIMEDELTA - -#define LONGLONG_FMT NPY_LONGLONG_FMT -#define ULONGLONG_FMT NPY_ULONGLONG_FMT -#define LONGLONG_SUFFIX NPY_LONGLONG_SUFFIX -#define ULONGLONG_SUFFIX NPY_ULONGLONG_SUFFIX - -#define MAX_INT8 127 -#define MIN_INT8 -128 -#define MAX_UINT8 255 -#define MAX_INT16 32767 -#define MIN_INT16 -32768 -#define MAX_UINT16 65535 -#define MAX_INT32 2147483647 -#define MIN_INT32 (-MAX_INT32 - 1) -#define MAX_UINT32 4294967295U -#define MAX_INT64 LONGLONG_SUFFIX(9223372036854775807) -#define MIN_INT64 (-MAX_INT64 - LONGLONG_SUFFIX(1)) -#define MAX_UINT64 ULONGLONG_SUFFIX(18446744073709551615) -#define MAX_INT128 LONGLONG_SUFFIX(85070591730234615865843651857942052864) -#define MIN_INT128 (-MAX_INT128 - LONGLONG_SUFFIX(1)) -#define MAX_UINT128 
ULONGLONG_SUFFIX(170141183460469231731687303715884105728) -#define MAX_INT256 LONGLONG_SUFFIX(57896044618658097711785492504343953926634992332820282019728792003956564819967) -#define MIN_INT256 (-MAX_INT256 - LONGLONG_SUFFIX(1)) -#define MAX_UINT256 ULONGLONG_SUFFIX(115792089237316195423570985008687907853269984665640564039457584007913129639935) - -#define MAX_BYTE NPY_MAX_BYTE -#define MIN_BYTE NPY_MIN_BYTE -#define MAX_UBYTE NPY_MAX_UBYTE -#define MAX_SHORT NPY_MAX_SHORT -#define MIN_SHORT NPY_MIN_SHORT -#define MAX_USHORT NPY_MAX_USHORT -#define MAX_INT NPY_MAX_INT -#define MIN_INT NPY_MIN_INT -#define MAX_UINT NPY_MAX_UINT -#define MAX_LONG NPY_MAX_LONG -#define MIN_LONG NPY_MIN_LONG -#define MAX_ULONG NPY_MAX_ULONG -#define MAX_LONGLONG NPY_MAX_LONGLONG -#define MIN_LONGLONG NPY_MIN_LONGLONG -#define MAX_ULONGLONG NPY_MAX_ULONGLONG -#define MIN_DATETIME NPY_MIN_DATETIME -#define MAX_DATETIME NPY_MAX_DATETIME -#define MIN_TIMEDELTA NPY_MIN_TIMEDELTA -#define MAX_TIMEDELTA NPY_MAX_TIMEDELTA - -#define SIZEOF_LONGDOUBLE NPY_SIZEOF_LONGDOUBLE -#define SIZEOF_LONGLONG NPY_SIZEOF_LONGLONG -#define SIZEOF_HALF NPY_SIZEOF_HALF -#define BITSOF_BOOL NPY_BITSOF_BOOL -#define BITSOF_CHAR NPY_BITSOF_CHAR -#define BITSOF_SHORT NPY_BITSOF_SHORT -#define BITSOF_INT NPY_BITSOF_INT -#define BITSOF_LONG NPY_BITSOF_LONG -#define BITSOF_LONGLONG NPY_BITSOF_LONGLONG -#define BITSOF_HALF NPY_BITSOF_HALF -#define BITSOF_FLOAT NPY_BITSOF_FLOAT -#define BITSOF_DOUBLE NPY_BITSOF_DOUBLE -#define BITSOF_LONGDOUBLE NPY_BITSOF_LONGDOUBLE -#define BITSOF_DATETIME NPY_BITSOF_DATETIME -#define BITSOF_TIMEDELTA NPY_BITSOF_TIMEDELTA - -#define _pya_malloc PyArray_malloc -#define _pya_free PyArray_free -#define _pya_realloc PyArray_realloc - -#define BEGIN_THREADS_DEF NPY_BEGIN_THREADS_DEF -#define BEGIN_THREADS NPY_BEGIN_THREADS -#define END_THREADS NPY_END_THREADS -#define ALLOW_C_API_DEF NPY_ALLOW_C_API_DEF -#define ALLOW_C_API NPY_ALLOW_C_API -#define DISABLE_C_API NPY_DISABLE_C_API - -#define PY_FAIL NPY_FAIL -#define PY_SUCCEED NPY_SUCCEED - -#ifndef TRUE -#define TRUE NPY_TRUE -#endif - -#ifndef FALSE -#define FALSE NPY_FALSE -#endif - -#define LONGDOUBLE_FMT NPY_LONGDOUBLE_FMT - -#define CONTIGUOUS NPY_CONTIGUOUS -#define C_CONTIGUOUS NPY_C_CONTIGUOUS -#define FORTRAN NPY_FORTRAN -#define F_CONTIGUOUS NPY_F_CONTIGUOUS -#define OWNDATA NPY_OWNDATA -#define FORCECAST NPY_FORCECAST -#define ENSURECOPY NPY_ENSURECOPY -#define ENSUREARRAY NPY_ENSUREARRAY -#define ELEMENTSTRIDES NPY_ELEMENTSTRIDES -#define ALIGNED NPY_ALIGNED -#define NOTSWAPPED NPY_NOTSWAPPED -#define WRITEABLE NPY_WRITEABLE -#define UPDATEIFCOPY NPY_UPDATEIFCOPY -#define ARR_HAS_DESCR NPY_ARR_HAS_DESCR -#define BEHAVED NPY_BEHAVED -#define BEHAVED_NS NPY_BEHAVED_NS -#define CARRAY NPY_CARRAY -#define CARRAY_RO NPY_CARRAY_RO -#define FARRAY NPY_FARRAY -#define FARRAY_RO NPY_FARRAY_RO -#define DEFAULT NPY_DEFAULT -#define IN_ARRAY NPY_IN_ARRAY -#define OUT_ARRAY NPY_OUT_ARRAY -#define INOUT_ARRAY NPY_INOUT_ARRAY -#define IN_FARRAY NPY_IN_FARRAY -#define OUT_FARRAY NPY_OUT_FARRAY -#define INOUT_FARRAY NPY_INOUT_FARRAY -#define UPDATE_ALL NPY_UPDATE_ALL - -#define OWN_DATA NPY_OWNDATA -#define BEHAVED_FLAGS NPY_BEHAVED -#define BEHAVED_FLAGS_NS NPY_BEHAVED_NS -#define CARRAY_FLAGS_RO NPY_CARRAY_RO -#define CARRAY_FLAGS NPY_CARRAY -#define FARRAY_FLAGS NPY_FARRAY -#define FARRAY_FLAGS_RO NPY_FARRAY_RO -#define DEFAULT_FLAGS NPY_DEFAULT -#define UPDATE_ALL_FLAGS NPY_UPDATE_ALL_FLAGS - -#ifndef MIN -#define MIN PyArray_MIN -#endif -#ifndef MAX -#define MAX 
PyArray_MAX -#endif -#define MAX_INTP NPY_MAX_INTP -#define MIN_INTP NPY_MIN_INTP -#define MAX_UINTP NPY_MAX_UINTP -#define INTP_FMT NPY_INTP_FMT - -#define REFCOUNT PyArray_REFCOUNT -#define MAX_ELSIZE NPY_MAX_ELSIZE - -#endif diff --git a/include/numpy/npy_3kcompat.h b/include/numpy/npy_3kcompat.h deleted file mode 100644 index d0cd9ac1a..000000000 --- a/include/numpy/npy_3kcompat.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - * This is a convenience header file providing compatibility utilities - * for supporting Python 2 and Python 3 in the same code base. - * - * If you want to use this for your own projects, it's recommended to make a - * copy of it. Although the stuff below is unlikely to change, we don't provide - * strong backwards compatibility guarantees at the moment. - */ - -#ifndef _NPY_3KCOMPAT_H_ -#define _NPY_3KCOMPAT_H_ - -#include -#include - -#if PY_VERSION_HEX >= 0x03000000 -#ifndef NPY_PY3K -#define NPY_PY3K 1 -#endif -#endif - -#include "numpy/npy_common.h" -#include "numpy/ndarrayobject.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * PyInt -> PyLong - */ - -#if defined(NPY_PY3K) -/* Return True only if the long fits in a C long */ -static NPY_INLINE int PyInt_Check(PyObject *op) { - int overflow = 0; - if (!PyLong_Check(op)) { - return 0; - } - PyLong_AsLongAndOverflow(op, &overflow); - return (overflow == 0); -} - -#define PyInt_FromLong PyLong_FromLong -#define PyInt_AsLong PyLong_AsLong -#define PyInt_AS_LONG PyLong_AsLong -#define PyInt_AsSsize_t PyLong_AsSsize_t - -/* NOTE: - * - * Since the PyLong type is very different from the fixed-range PyInt, - * we don't define PyInt_Type -> PyLong_Type. - */ -#endif /* NPY_PY3K */ - -/* - * PyString -> PyBytes - */ - -#if defined(NPY_PY3K) - -#define PyString_Type PyBytes_Type -#define PyString_Check PyBytes_Check -#define PyStringObject PyBytesObject -#define PyString_FromString PyBytes_FromString -#define PyString_FromStringAndSize PyBytes_FromStringAndSize -#define PyString_AS_STRING PyBytes_AS_STRING -#define PyString_AsStringAndSize PyBytes_AsStringAndSize -#define PyString_FromFormat PyBytes_FromFormat -#define PyString_Concat PyBytes_Concat -#define PyString_ConcatAndDel PyBytes_ConcatAndDel -#define PyString_AsString PyBytes_AsString -#define PyString_GET_SIZE PyBytes_GET_SIZE -#define PyString_Size PyBytes_Size - -#define PyUString_Type PyUnicode_Type -#define PyUString_Check PyUnicode_Check -#define PyUStringObject PyUnicodeObject -#define PyUString_FromString PyUnicode_FromString -#define PyUString_FromStringAndSize PyUnicode_FromStringAndSize -#define PyUString_FromFormat PyUnicode_FromFormat -#define PyUString_Concat PyUnicode_Concat2 -#define PyUString_ConcatAndDel PyUnicode_ConcatAndDel -#define PyUString_GET_SIZE PyUnicode_GET_SIZE -#define PyUString_Size PyUnicode_Size -#define PyUString_InternFromString PyUnicode_InternFromString -#define PyUString_Format PyUnicode_Format - -#else - -#define PyBytes_Type PyString_Type -#define PyBytes_Check PyString_Check -#define PyBytesObject PyStringObject -#define PyBytes_FromString PyString_FromString -#define PyBytes_FromStringAndSize PyString_FromStringAndSize -#define PyBytes_AS_STRING PyString_AS_STRING -#define PyBytes_AsStringAndSize PyString_AsStringAndSize -#define PyBytes_FromFormat PyString_FromFormat -#define PyBytes_Concat PyString_Concat -#define PyBytes_ConcatAndDel PyString_ConcatAndDel -#define PyBytes_AsString PyString_AsString -#define PyBytes_GET_SIZE PyString_GET_SIZE -#define PyBytes_Size PyString_Size - -#define PyUString_Type PyString_Type 
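For illustration only (not part of the deleted header): the PyUString_* and PyBytes_* aliases in this block let code that includes npy_3kcompat.h target Python 2 and Python 3 from a single source; under NPY_PY3K they resolve to the PyUnicode_*/PyBytes_* APIs, while in this branch both families fall back to PyString_*. A minimal sketch of typical downstream use; the helper names make_native_str and is_native_str are hypothetical.

/* Hypothetical helper: build a "native str", i.e. unicode under
 * NPY_PY3K and bytes on Python 2, from a UTF-8/ASCII C string. */
static NPY_INLINE PyObject *
make_native_str(const char *s)
{
    return PyUString_FromString(s);
}

/* Hypothetical helper: nonzero if obj is a native str on either
 * major Python version. */
static NPY_INLINE int
is_native_str(PyObject *obj)
{
    return PyUString_Check(obj);
}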
-#define PyUString_Check PyString_Check -#define PyUStringObject PyStringObject -#define PyUString_FromString PyString_FromString -#define PyUString_FromStringAndSize PyString_FromStringAndSize -#define PyUString_FromFormat PyString_FromFormat -#define PyUString_Concat PyString_Concat -#define PyUString_ConcatAndDel PyString_ConcatAndDel -#define PyUString_GET_SIZE PyString_GET_SIZE -#define PyUString_Size PyString_Size -#define PyUString_InternFromString PyString_InternFromString -#define PyUString_Format PyString_Format - -#endif /* NPY_PY3K */ - - -static NPY_INLINE void -PyUnicode_ConcatAndDel(PyObject **left, PyObject *right) -{ - PyObject *newobj; - newobj = PyUnicode_Concat(*left, right); - Py_DECREF(*left); - Py_DECREF(right); - *left = newobj; -} - -static NPY_INLINE void -PyUnicode_Concat2(PyObject **left, PyObject *right) -{ - PyObject *newobj; - newobj = PyUnicode_Concat(*left, right); - Py_DECREF(*left); - *left = newobj; -} - -/* - * PyFile_* compatibility - */ -#if defined(NPY_PY3K) - -/* - * Get a FILE* handle to the file represented by the Python object - */ -static NPY_INLINE FILE* -npy_PyFile_Dup(PyObject *file, char *mode) -{ - int fd, fd2; - PyObject *ret, *os; - Py_ssize_t pos; - FILE *handle; - /* Flush first to ensure things end up in the file in the correct order */ - ret = PyObject_CallMethod(file, "flush", ""); - if (ret == NULL) { - return NULL; - } - Py_DECREF(ret); - fd = PyObject_AsFileDescriptor(file); - if (fd == -1) { - return NULL; - } - os = PyImport_ImportModule("os"); - if (os == NULL) { - return NULL; - } - ret = PyObject_CallMethod(os, "dup", "i", fd); - Py_DECREF(os); - if (ret == NULL) { - return NULL; - } - fd2 = PyNumber_AsSsize_t(ret, NULL); - Py_DECREF(ret); -#ifdef _WIN32 - handle = _fdopen(fd2, mode); -#else - handle = fdopen(fd2, mode); -#endif - if (handle == NULL) { - PyErr_SetString(PyExc_IOError, - "Getting a FILE* from a Python file object failed"); - } - ret = PyObject_CallMethod(file, "tell", ""); - if (ret == NULL) { - fclose(handle); - return NULL; - } - pos = PyNumber_AsSsize_t(ret, PyExc_OverflowError); - Py_DECREF(ret); - if (PyErr_Occurred()) { - fclose(handle); - return NULL; - } - npy_fseek(handle, pos, SEEK_SET); - return handle; -} - -/* - * Close the dup-ed file handle, and seek the Python one to the current position - */ -static NPY_INLINE int -npy_PyFile_DupClose(PyObject *file, FILE* handle) -{ - PyObject *ret; - Py_ssize_t position; - position = npy_ftell(handle); - fclose(handle); - - ret = PyObject_CallMethod(file, "seek", NPY_SSIZE_T_PYFMT "i", position, 0); - if (ret == NULL) { - return -1; - } - Py_DECREF(ret); - return 0; -} - -static NPY_INLINE int -npy_PyFile_Check(PyObject *file) -{ - int fd; - fd = PyObject_AsFileDescriptor(file); - if (fd == -1) { - PyErr_Clear(); - return 0; - } - return 1; -} - -#else - -#define npy_PyFile_Dup(file, mode) PyFile_AsFile(file) -#define npy_PyFile_DupClose(file, handle) (0) -#define npy_PyFile_Check PyFile_Check - -#endif - -static NPY_INLINE PyObject* -npy_PyFile_OpenFile(PyObject *filename, const char *mode) -{ - PyObject *open; - open = PyDict_GetItemString(PyEval_GetBuiltins(), "open"); - if (open == NULL) { - return NULL; - } - return PyObject_CallFunction(open, "Os", filename, mode); -} - -static NPY_INLINE int -npy_PyFile_CloseFile(PyObject *file) -{ - PyObject *ret; - - ret = PyObject_CallMethod(file, "close", NULL); - if (ret == NULL) { - return -1; - } - Py_DECREF(ret); - return 0; -} - -/* - * PyObject_Cmp - */ -#if defined(NPY_PY3K) -static NPY_INLINE int 
-PyObject_Cmp(PyObject *i1, PyObject *i2, int *cmp) -{ - int v; - v = PyObject_RichCompareBool(i1, i2, Py_LT); - if (v == 0) { - *cmp = -1; - return 1; - } - else if (v == -1) { - return -1; - } - - v = PyObject_RichCompareBool(i1, i2, Py_GT); - if (v == 0) { - *cmp = 1; - return 1; - } - else if (v == -1) { - return -1; - } - - v = PyObject_RichCompareBool(i1, i2, Py_EQ); - if (v == 0) { - *cmp = 0; - return 1; - } - else { - *cmp = 0; - return -1; - } -} -#endif - -/* - * PyCObject functions adapted to PyCapsules. - * - * The main job here is to get rid of the improved error handling - * of PyCapsules. It's a shame... - */ -#if PY_VERSION_HEX >= 0x03000000 - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtr(void *ptr, void (*dtor)(PyObject *)) -{ - PyObject *ret = PyCapsule_New(ptr, NULL, dtor); - if (ret == NULL) { - PyErr_Clear(); - } - return ret; -} - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtrAndDesc(void *ptr, void* context, void (*dtor)(PyObject *)) -{ - PyObject *ret = NpyCapsule_FromVoidPtr(ptr, dtor); - if (ret != NULL && PyCapsule_SetContext(ret, context) != 0) { - PyErr_Clear(); - Py_DECREF(ret); - ret = NULL; - } - return ret; -} - -static NPY_INLINE void * -NpyCapsule_AsVoidPtr(PyObject *obj) -{ - void *ret = PyCapsule_GetPointer(obj, NULL); - if (ret == NULL) { - PyErr_Clear(); - } - return ret; -} - -static NPY_INLINE void * -NpyCapsule_GetDesc(PyObject *obj) -{ - return PyCapsule_GetContext(obj); -} - -static NPY_INLINE int -NpyCapsule_Check(PyObject *ptr) -{ - return PyCapsule_CheckExact(ptr); -} - -static NPY_INLINE void -simple_capsule_dtor(PyObject *cap) -{ - PyArray_free(PyCapsule_GetPointer(cap, NULL)); -} - -#else - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtr(void *ptr, void (*dtor)(void *)) -{ - return PyCObject_FromVoidPtr(ptr, dtor); -} - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtrAndDesc(void *ptr, void* context, - void (*dtor)(void *, void *)) -{ - return PyCObject_FromVoidPtrAndDesc(ptr, context, dtor); -} - -static NPY_INLINE void * -NpyCapsule_AsVoidPtr(PyObject *ptr) -{ - return PyCObject_AsVoidPtr(ptr); -} - -static NPY_INLINE void * -NpyCapsule_GetDesc(PyObject *obj) -{ - return PyCObject_GetDesc(obj); -} - -static NPY_INLINE int -NpyCapsule_Check(PyObject *ptr) -{ - return PyCObject_Check(ptr); -} - -static NPY_INLINE void -simple_capsule_dtor(void *ptr) -{ - PyArray_free(ptr); -} - -#endif - -/* - * Hash value compatibility. - * As of Python 3.2 hash values are of type Py_hash_t. - * Previous versions use C long. - */ -#if PY_VERSION_HEX < 0x03020000 -typedef long npy_hash_t; -#define NPY_SIZEOF_HASH_T NPY_SIZEOF_LONG -#else -typedef Py_hash_t npy_hash_t; -#define NPY_SIZEOF_HASH_T NPY_SIZEOF_INTP -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _NPY_3KCOMPAT_H_ */ diff --git a/include/numpy/npy_common.h b/include/numpy/npy_common.h deleted file mode 100644 index 7fca7e220..000000000 --- a/include/numpy/npy_common.h +++ /dev/null @@ -1,930 +0,0 @@ -#ifndef _NPY_COMMON_H_ -#define _NPY_COMMON_H_ - -/* numpconfig.h is auto-generated */ -#include "numpyconfig.h" - -#if defined(_MSC_VER) - #define NPY_INLINE __inline -#elif defined(__GNUC__) - #if defined(__STRICT_ANSI__) - #define NPY_INLINE __inline__ - #else - #define NPY_INLINE inline - #endif -#else - #define NPY_INLINE -#endif - -/* Enable 64 bit file position support on win-amd64. 
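/*
 * Sketch only: one way the NpyCapsule_* wrappers above might be used. They
 * present the old PyCObject-style call pattern on top of PyCapsule under
 * Python 3, so the same call sites build on both versions. The buffer size and
 * function names are invented; the NumPy array headers are assumed to be
 * included for PyArray_malloc.
 */
static PyObject *
wrap_workspace(void)
{
    void *buf = PyArray_malloc(1024);
    if (buf == NULL) {
        return PyErr_NoMemory();
    }
    /* simple_capsule_dtor frees the pointer when the capsule is collected */
    return NpyCapsule_FromVoidPtr(buf, simple_capsule_dtor);
}

static void *
unwrap_workspace(PyObject *capsule)
{
    return NpyCapsule_Check(capsule) ? NpyCapsule_AsVoidPtr(capsule) : NULL;
}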
Ticket #1660 */ -#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400) - #define npy_fseek _fseeki64 - #define npy_ftell _ftelli64 -#else - #define npy_fseek fseek - #define npy_ftell ftell -#endif - -/* enums for detected endianness */ -enum { - NPY_CPU_UNKNOWN_ENDIAN, - NPY_CPU_LITTLE, - NPY_CPU_BIG -}; - -/* - * This is to typedef npy_intp to the appropriate pointer size for - * this platform. Py_intptr_t, Py_uintptr_t are defined in pyport.h. - */ -typedef Py_intptr_t npy_intp; -typedef Py_uintptr_t npy_uintp; -#define NPY_SIZEOF_CHAR 1 -#define NPY_SIZEOF_BYTE 1 -#define NPY_SIZEOF_INTP NPY_SIZEOF_PY_INTPTR_T -#define NPY_SIZEOF_UINTP NPY_SIZEOF_PY_INTPTR_T -#define NPY_SIZEOF_CFLOAT NPY_SIZEOF_COMPLEX_FLOAT -#define NPY_SIZEOF_CDOUBLE NPY_SIZEOF_COMPLEX_DOUBLE -#define NPY_SIZEOF_CLONGDOUBLE NPY_SIZEOF_COMPLEX_LONGDOUBLE - -#ifdef constchar -#undef constchar -#endif - -#if (PY_VERSION_HEX < 0x02050000) - #ifndef PY_SSIZE_T_MIN - typedef int Py_ssize_t; - #define PY_SSIZE_T_MAX INT_MAX - #define PY_SSIZE_T_MIN INT_MIN - #endif -#define NPY_SSIZE_T_PYFMT "i" -#define constchar const char -#else -#define NPY_SSIZE_T_PYFMT "n" -#define constchar char -#endif - -/* NPY_INTP_FMT Note: - * Unlike the other NPY_*_FMT macros which are used with - * PyOS_snprintf, NPY_INTP_FMT is used with PyErr_Format and - * PyString_Format. These functions use different formatting - * codes which are portably specified according to the Python - * documentation. See ticket #1795. - * - * On Windows x64, the LONGLONG formatter should be used, but - * in Python 2.6 the %lld formatter is not supported. In this - * case we work around the problem by using the %zd formatter. - */ -#if NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_INT - #define NPY_INTP NPY_INT - #define NPY_UINTP NPY_UINT - #define PyIntpArrType_Type PyIntArrType_Type - #define PyUIntpArrType_Type PyUIntArrType_Type - #define NPY_MAX_INTP NPY_MAX_INT - #define NPY_MIN_INTP NPY_MIN_INT - #define NPY_MAX_UINTP NPY_MAX_UINT - #define NPY_INTP_FMT "d" -#elif NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_LONG - #define NPY_INTP NPY_LONG - #define NPY_UINTP NPY_ULONG - #define PyIntpArrType_Type PyLongArrType_Type - #define PyUIntpArrType_Type PyULongArrType_Type - #define NPY_MAX_INTP NPY_MAX_LONG - #define NPY_MIN_INTP NPY_MIN_LONG - #define NPY_MAX_UINTP NPY_MAX_ULONG - #define NPY_INTP_FMT "ld" -#elif defined(PY_LONG_LONG) && (NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_LONGLONG) - #define NPY_INTP NPY_LONGLONG - #define NPY_UINTP NPY_ULONGLONG - #define PyIntpArrType_Type PyLongLongArrType_Type - #define PyUIntpArrType_Type PyULongLongArrType_Type - #define NPY_MAX_INTP NPY_MAX_LONGLONG - #define NPY_MIN_INTP NPY_MIN_LONGLONG - #define NPY_MAX_UINTP NPY_MAX_ULONGLONG - #if (PY_VERSION_HEX >= 0x02070000) - #define NPY_INTP_FMT "lld" - #else - #define NPY_INTP_FMT "zd" - #endif -#endif - -/* - * We can only use C99 formats for npy_int_p if it is the same as - * intp_t, hence the condition on HAVE_UNITPTR_T - */ -#if (NPY_USE_C99_FORMATS) == 1 \ - && (defined HAVE_UINTPTR_T) \ - && (defined HAVE_INTTYPES_H) - #include - #undef NPY_INTP_FMT - #define NPY_INTP_FMT PRIdPTR -#endif - - -/* - * Some platforms don't define bool, long long, or long double. - * Handle that here. 
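/*
 * Sketch of the NPY_INTP_FMT note above in practice: the macro expands to "d",
 * "ld", "lld" or "zd" so an npy_intp can be formatted portably with
 * PyErr_Format. The function and message are hypothetical; Python.h and this
 * header are assumed to be included.
 */
static void
set_index_error(npy_intp index, npy_intp size)
{
    PyErr_Format(PyExc_IndexError,
                 "index %" NPY_INTP_FMT " is out of bounds for size %" NPY_INTP_FMT,
                 index, size);
}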
- */ -#define NPY_BYTE_FMT "hhd" -#define NPY_UBYTE_FMT "hhu" -#define NPY_SHORT_FMT "hd" -#define NPY_USHORT_FMT "hu" -#define NPY_INT_FMT "d" -#define NPY_UINT_FMT "u" -#define NPY_LONG_FMT "ld" -#define NPY_ULONG_FMT "lu" -#define NPY_HALF_FMT "g" -#define NPY_FLOAT_FMT "g" -#define NPY_DOUBLE_FMT "g" - - -#ifdef PY_LONG_LONG -typedef PY_LONG_LONG npy_longlong; -typedef unsigned PY_LONG_LONG npy_ulonglong; -# ifdef _MSC_VER -# define NPY_LONGLONG_FMT "I64d" -# define NPY_ULONGLONG_FMT "I64u" -# elif defined(__APPLE__) || defined(__FreeBSD__) -/* "%Ld" only parses 4 bytes -- "L" is floating modifier on MacOS X/BSD */ -# define NPY_LONGLONG_FMT "lld" -# define NPY_ULONGLONG_FMT "llu" -/* - another possible variant -- *quad_t works on *BSD, but is deprecated: - #define LONGLONG_FMT "qd" - #define ULONGLONG_FMT "qu" -*/ -# else -# define NPY_LONGLONG_FMT "Ld" -# define NPY_ULONGLONG_FMT "Lu" -# endif -# ifdef _MSC_VER -# define NPY_LONGLONG_SUFFIX(x) (x##i64) -# define NPY_ULONGLONG_SUFFIX(x) (x##Ui64) -# else -# define NPY_LONGLONG_SUFFIX(x) (x##LL) -# define NPY_ULONGLONG_SUFFIX(x) (x##ULL) -# endif -#else -typedef long npy_longlong; -typedef unsigned long npy_ulonglong; -# define NPY_LONGLONG_SUFFIX(x) (x##L) -# define NPY_ULONGLONG_SUFFIX(x) (x##UL) -#endif - - -typedef unsigned char npy_bool; -#define NPY_FALSE 0 -#define NPY_TRUE 1 - - -#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE - typedef double npy_longdouble; - #define NPY_LONGDOUBLE_FMT "g" -#else - typedef long double npy_longdouble; - #define NPY_LONGDOUBLE_FMT "Lg" -#endif - -#ifndef Py_USING_UNICODE -#error Must use Python with unicode enabled. -#endif - - -typedef signed char npy_byte; -typedef unsigned char npy_ubyte; -typedef unsigned short npy_ushort; -typedef unsigned int npy_uint; -typedef unsigned long npy_ulong; - -/* These are for completeness */ -typedef char npy_char; -typedef short npy_short; -typedef int npy_int; -typedef long npy_long; -typedef float npy_float; -typedef double npy_double; - -/* - * Disabling C99 complex usage: a lot of C code in numpy/scipy rely on being - * able to do .real/.imag. Will have to convert code first. - */ -#if 0 -#if defined(NPY_USE_C99_COMPLEX) && defined(NPY_HAVE_COMPLEX_DOUBLE) -typedef complex npy_cdouble; -#else -typedef struct { double real, imag; } npy_cdouble; -#endif - -#if defined(NPY_USE_C99_COMPLEX) && defined(NPY_HAVE_COMPLEX_FLOAT) -typedef complex float npy_cfloat; -#else -typedef struct { float real, imag; } npy_cfloat; -#endif - -#if defined(NPY_USE_C99_COMPLEX) && defined(NPY_HAVE_COMPLEX_LONG_DOUBLE) -typedef complex long double npy_clongdouble; -#else -typedef struct {npy_longdouble real, imag;} npy_clongdouble; -#endif -#endif -#if NPY_SIZEOF_COMPLEX_DOUBLE != 2 * NPY_SIZEOF_DOUBLE -#error npy_cdouble definition is not compatible with C99 complex definition ! \ - Please contact Numpy maintainers and give detailed information about your \ - compiler and platform -#endif -typedef struct { double real, imag; } npy_cdouble; - -#if NPY_SIZEOF_COMPLEX_FLOAT != 2 * NPY_SIZEOF_FLOAT -#error npy_cfloat definition is not compatible with C99 complex definition ! \ - Please contact Numpy maintainers and give detailed information about your \ - compiler and platform -#endif -typedef struct { float real, imag; } npy_cfloat; - -#if NPY_SIZEOF_COMPLEX_LONGDOUBLE != 2 * NPY_SIZEOF_LONGDOUBLE -#error npy_clongdouble definition is not compatible with C99 complex definition ! 
\ - Please contact Numpy maintainers and give detailed information about your \ - compiler and platform -#endif -typedef struct { npy_longdouble real, imag; } npy_clongdouble; - -/* - * numarray-style bit-width typedefs - */ -#define NPY_MAX_INT8 127 -#define NPY_MIN_INT8 -128 -#define NPY_MAX_UINT8 255 -#define NPY_MAX_INT16 32767 -#define NPY_MIN_INT16 -32768 -#define NPY_MAX_UINT16 65535 -#define NPY_MAX_INT32 2147483647 -#define NPY_MIN_INT32 (-NPY_MAX_INT32 - 1) -#define NPY_MAX_UINT32 4294967295U -#define NPY_MAX_INT64 NPY_LONGLONG_SUFFIX(9223372036854775807) -#define NPY_MIN_INT64 (-NPY_MAX_INT64 - NPY_LONGLONG_SUFFIX(1)) -#define NPY_MAX_UINT64 NPY_ULONGLONG_SUFFIX(18446744073709551615) -#define NPY_MAX_INT128 NPY_LONGLONG_SUFFIX(85070591730234615865843651857942052864) -#define NPY_MIN_INT128 (-NPY_MAX_INT128 - NPY_LONGLONG_SUFFIX(1)) -#define NPY_MAX_UINT128 NPY_ULONGLONG_SUFFIX(170141183460469231731687303715884105728) -#define NPY_MAX_INT256 NPY_LONGLONG_SUFFIX(57896044618658097711785492504343953926634992332820282019728792003956564819967) -#define NPY_MIN_INT256 (-NPY_MAX_INT256 - NPY_LONGLONG_SUFFIX(1)) -#define NPY_MAX_UINT256 NPY_ULONGLONG_SUFFIX(115792089237316195423570985008687907853269984665640564039457584007913129639935) -#define NPY_MIN_DATETIME NPY_MIN_INT64 -#define NPY_MAX_DATETIME NPY_MAX_INT64 -#define NPY_MIN_TIMEDELTA NPY_MIN_INT64 -#define NPY_MAX_TIMEDELTA NPY_MAX_INT64 - - /* Need to find the number of bits for each type and - make definitions accordingly. - - C states that sizeof(char) == 1 by definition - - So, just using the sizeof keyword won't help. - - It also looks like Python itself uses sizeof(char) quite a - bit, which by definition should be 1 all the time. - - Idea: Make Use of CHAR_BIT which should tell us how many - BITS per CHARACTER - */ - - /* Include platform definitions -- These are in the C89/90 standard */ -#include -#define NPY_MAX_BYTE SCHAR_MAX -#define NPY_MIN_BYTE SCHAR_MIN -#define NPY_MAX_UBYTE UCHAR_MAX -#define NPY_MAX_SHORT SHRT_MAX -#define NPY_MIN_SHORT SHRT_MIN -#define NPY_MAX_USHORT USHRT_MAX -#define NPY_MAX_INT INT_MAX -#ifndef INT_MIN -#define INT_MIN (-INT_MAX - 1) -#endif -#define NPY_MIN_INT INT_MIN -#define NPY_MAX_UINT UINT_MAX -#define NPY_MAX_LONG LONG_MAX -#define NPY_MIN_LONG LONG_MIN -#define NPY_MAX_ULONG ULONG_MAX - -#define NPY_SIZEOF_HALF 2 -#define NPY_SIZEOF_DATETIME 8 -#define NPY_SIZEOF_TIMEDELTA 8 - -#define NPY_BITSOF_BOOL (sizeof(npy_bool) * CHAR_BIT) -#define NPY_BITSOF_CHAR CHAR_BIT -#define NPY_BITSOF_BYTE (NPY_SIZEOF_BYTE * CHAR_BIT) -#define NPY_BITSOF_SHORT (NPY_SIZEOF_SHORT * CHAR_BIT) -#define NPY_BITSOF_INT (NPY_SIZEOF_INT * CHAR_BIT) -#define NPY_BITSOF_LONG (NPY_SIZEOF_LONG * CHAR_BIT) -#define NPY_BITSOF_LONGLONG (NPY_SIZEOF_LONGLONG * CHAR_BIT) -#define NPY_BITSOF_INTP (NPY_SIZEOF_INTP * CHAR_BIT) -#define NPY_BITSOF_HALF (NPY_SIZEOF_HALF * CHAR_BIT) -#define NPY_BITSOF_FLOAT (NPY_SIZEOF_FLOAT * CHAR_BIT) -#define NPY_BITSOF_DOUBLE (NPY_SIZEOF_DOUBLE * CHAR_BIT) -#define NPY_BITSOF_LONGDOUBLE (NPY_SIZEOF_LONGDOUBLE * CHAR_BIT) -#define NPY_BITSOF_CFLOAT (NPY_SIZEOF_CFLOAT * CHAR_BIT) -#define NPY_BITSOF_CDOUBLE (NPY_SIZEOF_CDOUBLE * CHAR_BIT) -#define NPY_BITSOF_CLONGDOUBLE (NPY_SIZEOF_CLONGDOUBLE * CHAR_BIT) -#define NPY_BITSOF_DATETIME (NPY_SIZEOF_DATETIME * CHAR_BIT) -#define NPY_BITSOF_TIMEDELTA (NPY_SIZEOF_TIMEDELTA * CHAR_BIT) - -#if NPY_BITSOF_LONG == 8 -#define NPY_INT8 NPY_LONG -#define NPY_UINT8 NPY_ULONG - typedef long npy_int8; - typedef unsigned long npy_uint8; -#define 
PyInt8ScalarObject PyLongScalarObject -#define PyInt8ArrType_Type PyLongArrType_Type -#define PyUInt8ScalarObject PyULongScalarObject -#define PyUInt8ArrType_Type PyULongArrType_Type -#define NPY_INT8_FMT NPY_LONG_FMT -#define NPY_UINT8_FMT NPY_ULONG_FMT -#elif NPY_BITSOF_LONG == 16 -#define NPY_INT16 NPY_LONG -#define NPY_UINT16 NPY_ULONG - typedef long npy_int16; - typedef unsigned long npy_uint16; -#define PyInt16ScalarObject PyLongScalarObject -#define PyInt16ArrType_Type PyLongArrType_Type -#define PyUInt16ScalarObject PyULongScalarObject -#define PyUInt16ArrType_Type PyULongArrType_Type -#define NPY_INT16_FMT NPY_LONG_FMT -#define NPY_UINT16_FMT NPY_ULONG_FMT -#elif NPY_BITSOF_LONG == 32 -#define NPY_INT32 NPY_LONG -#define NPY_UINT32 NPY_ULONG - typedef long npy_int32; - typedef unsigned long npy_uint32; - typedef unsigned long npy_ucs4; -#define PyInt32ScalarObject PyLongScalarObject -#define PyInt32ArrType_Type PyLongArrType_Type -#define PyUInt32ScalarObject PyULongScalarObject -#define PyUInt32ArrType_Type PyULongArrType_Type -#define NPY_INT32_FMT NPY_LONG_FMT -#define NPY_UINT32_FMT NPY_ULONG_FMT -#elif NPY_BITSOF_LONG == 64 -#define NPY_INT64 NPY_LONG -#define NPY_UINT64 NPY_ULONG - typedef long npy_int64; - typedef unsigned long npy_uint64; -#define PyInt64ScalarObject PyLongScalarObject -#define PyInt64ArrType_Type PyLongArrType_Type -#define PyUInt64ScalarObject PyULongScalarObject -#define PyUInt64ArrType_Type PyULongArrType_Type -#define NPY_INT64_FMT NPY_LONG_FMT -#define NPY_UINT64_FMT NPY_ULONG_FMT -#define MyPyLong_FromInt64 PyLong_FromLong -#define MyPyLong_AsInt64 PyLong_AsLong -#elif NPY_BITSOF_LONG == 128 -#define NPY_INT128 NPY_LONG -#define NPY_UINT128 NPY_ULONG - typedef long npy_int128; - typedef unsigned long npy_uint128; -#define PyInt128ScalarObject PyLongScalarObject -#define PyInt128ArrType_Type PyLongArrType_Type -#define PyUInt128ScalarObject PyULongScalarObject -#define PyUInt128ArrType_Type PyULongArrType_Type -#define NPY_INT128_FMT NPY_LONG_FMT -#define NPY_UINT128_FMT NPY_ULONG_FMT -#endif - -#if NPY_BITSOF_LONGLONG == 8 -# ifndef NPY_INT8 -# define NPY_INT8 NPY_LONGLONG -# define NPY_UINT8 NPY_ULONGLONG - typedef npy_longlong npy_int8; - typedef npy_ulonglong npy_uint8; -# define PyInt8ScalarObject PyLongLongScalarObject -# define PyInt8ArrType_Type PyLongLongArrType_Type -# define PyUInt8ScalarObject PyULongLongScalarObject -# define PyUInt8ArrType_Type PyULongLongArrType_Type -#define NPY_INT8_FMT NPY_LONGLONG_FMT -#define NPY_UINT8_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT8 -# define NPY_MIN_LONGLONG NPY_MIN_INT8 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT8 -#elif NPY_BITSOF_LONGLONG == 16 -# ifndef NPY_INT16 -# define NPY_INT16 NPY_LONGLONG -# define NPY_UINT16 NPY_ULONGLONG - typedef npy_longlong npy_int16; - typedef npy_ulonglong npy_uint16; -# define PyInt16ScalarObject PyLongLongScalarObject -# define PyInt16ArrType_Type PyLongLongArrType_Type -# define PyUInt16ScalarObject PyULongLongScalarObject -# define PyUInt16ArrType_Type PyULongLongArrType_Type -#define NPY_INT16_FMT NPY_LONGLONG_FMT -#define NPY_UINT16_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT16 -# define NPY_MIN_LONGLONG NPY_MIN_INT16 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT16 -#elif NPY_BITSOF_LONGLONG == 32 -# ifndef NPY_INT32 -# define NPY_INT32 NPY_LONGLONG -# define NPY_UINT32 NPY_ULONGLONG - typedef npy_longlong npy_int32; - typedef npy_ulonglong npy_uint32; - typedef npy_ulonglong npy_ucs4; -# define PyInt32ScalarObject 
PyLongLongScalarObject -# define PyInt32ArrType_Type PyLongLongArrType_Type -# define PyUInt32ScalarObject PyULongLongScalarObject -# define PyUInt32ArrType_Type PyULongLongArrType_Type -#define NPY_INT32_FMT NPY_LONGLONG_FMT -#define NPY_UINT32_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT32 -# define NPY_MIN_LONGLONG NPY_MIN_INT32 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT32 -#elif NPY_BITSOF_LONGLONG == 64 -# ifndef NPY_INT64 -# define NPY_INT64 NPY_LONGLONG -# define NPY_UINT64 NPY_ULONGLONG - typedef npy_longlong npy_int64; - typedef npy_ulonglong npy_uint64; -# define PyInt64ScalarObject PyLongLongScalarObject -# define PyInt64ArrType_Type PyLongLongArrType_Type -# define PyUInt64ScalarObject PyULongLongScalarObject -# define PyUInt64ArrType_Type PyULongLongArrType_Type -#define NPY_INT64_FMT NPY_LONGLONG_FMT -#define NPY_UINT64_FMT NPY_ULONGLONG_FMT -# define MyPyLong_FromInt64 PyLong_FromLongLong -# define MyPyLong_AsInt64 PyLong_AsLongLong -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT64 -# define NPY_MIN_LONGLONG NPY_MIN_INT64 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT64 -#elif NPY_BITSOF_LONGLONG == 128 -# ifndef NPY_INT128 -# define NPY_INT128 NPY_LONGLONG -# define NPY_UINT128 NPY_ULONGLONG - typedef npy_longlong npy_int128; - typedef npy_ulonglong npy_uint128; -# define PyInt128ScalarObject PyLongLongScalarObject -# define PyInt128ArrType_Type PyLongLongArrType_Type -# define PyUInt128ScalarObject PyULongLongScalarObject -# define PyUInt128ArrType_Type PyULongLongArrType_Type -#define NPY_INT128_FMT NPY_LONGLONG_FMT -#define NPY_UINT128_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT128 -# define NPY_MIN_LONGLONG NPY_MIN_INT128 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT128 -#elif NPY_BITSOF_LONGLONG == 256 -# define NPY_INT256 NPY_LONGLONG -# define NPY_UINT256 NPY_ULONGLONG - typedef npy_longlong npy_int256; - typedef npy_ulonglong npy_uint256; -# define PyInt256ScalarObject PyLongLongScalarObject -# define PyInt256ArrType_Type PyLongLongArrType_Type -# define PyUInt256ScalarObject PyULongLongScalarObject -# define PyUInt256ArrType_Type PyULongLongArrType_Type -#define NPY_INT256_FMT NPY_LONGLONG_FMT -#define NPY_UINT256_FMT NPY_ULONGLONG_FMT -# define NPY_MAX_LONGLONG NPY_MAX_INT256 -# define NPY_MIN_LONGLONG NPY_MIN_INT256 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT256 -#endif - -#if NPY_BITSOF_INT == 8 -#ifndef NPY_INT8 -#define NPY_INT8 NPY_INT -#define NPY_UINT8 NPY_UINT - typedef int npy_int8; - typedef unsigned int npy_uint8; -# define PyInt8ScalarObject PyIntScalarObject -# define PyInt8ArrType_Type PyIntArrType_Type -# define PyUInt8ScalarObject PyUIntScalarObject -# define PyUInt8ArrType_Type PyUIntArrType_Type -#define NPY_INT8_FMT NPY_INT_FMT -#define NPY_UINT8_FMT NPY_UINT_FMT -#endif -#elif NPY_BITSOF_INT == 16 -#ifndef NPY_INT16 -#define NPY_INT16 NPY_INT -#define NPY_UINT16 NPY_UINT - typedef int npy_int16; - typedef unsigned int npy_uint16; -# define PyInt16ScalarObject PyIntScalarObject -# define PyInt16ArrType_Type PyIntArrType_Type -# define PyUInt16ScalarObject PyIntUScalarObject -# define PyUInt16ArrType_Type PyIntUArrType_Type -#define NPY_INT16_FMT NPY_INT_FMT -#define NPY_UINT16_FMT NPY_UINT_FMT -#endif -#elif NPY_BITSOF_INT == 32 -#ifndef NPY_INT32 -#define NPY_INT32 NPY_INT -#define NPY_UINT32 NPY_UINT - typedef int npy_int32; - typedef unsigned int npy_uint32; - typedef unsigned int npy_ucs4; -# define PyInt32ScalarObject PyIntScalarObject -# define PyInt32ArrType_Type PyIntArrType_Type -# define 
PyUInt32ScalarObject PyUIntScalarObject -# define PyUInt32ArrType_Type PyUIntArrType_Type -#define NPY_INT32_FMT NPY_INT_FMT -#define NPY_UINT32_FMT NPY_UINT_FMT -#endif -#elif NPY_BITSOF_INT == 64 -#ifndef NPY_INT64 -#define NPY_INT64 NPY_INT -#define NPY_UINT64 NPY_UINT - typedef int npy_int64; - typedef unsigned int npy_uint64; -# define PyInt64ScalarObject PyIntScalarObject -# define PyInt64ArrType_Type PyIntArrType_Type -# define PyUInt64ScalarObject PyUIntScalarObject -# define PyUInt64ArrType_Type PyUIntArrType_Type -#define NPY_INT64_FMT NPY_INT_FMT -#define NPY_UINT64_FMT NPY_UINT_FMT -# define MyPyLong_FromInt64 PyLong_FromLong -# define MyPyLong_AsInt64 PyLong_AsLong -#endif -#elif NPY_BITSOF_INT == 128 -#ifndef NPY_INT128 -#define NPY_INT128 NPY_INT -#define NPY_UINT128 NPY_UINT - typedef int npy_int128; - typedef unsigned int npy_uint128; -# define PyInt128ScalarObject PyIntScalarObject -# define PyInt128ArrType_Type PyIntArrType_Type -# define PyUInt128ScalarObject PyUIntScalarObject -# define PyUInt128ArrType_Type PyUIntArrType_Type -#define NPY_INT128_FMT NPY_INT_FMT -#define NPY_UINT128_FMT NPY_UINT_FMT -#endif -#endif - -#if NPY_BITSOF_SHORT == 8 -#ifndef NPY_INT8 -#define NPY_INT8 NPY_SHORT -#define NPY_UINT8 NPY_USHORT - typedef short npy_int8; - typedef unsigned short npy_uint8; -# define PyInt8ScalarObject PyShortScalarObject -# define PyInt8ArrType_Type PyShortArrType_Type -# define PyUInt8ScalarObject PyUShortScalarObject -# define PyUInt8ArrType_Type PyUShortArrType_Type -#define NPY_INT8_FMT NPY_SHORT_FMT -#define NPY_UINT8_FMT NPY_USHORT_FMT -#endif -#elif NPY_BITSOF_SHORT == 16 -#ifndef NPY_INT16 -#define NPY_INT16 NPY_SHORT -#define NPY_UINT16 NPY_USHORT - typedef short npy_int16; - typedef unsigned short npy_uint16; -# define PyInt16ScalarObject PyShortScalarObject -# define PyInt16ArrType_Type PyShortArrType_Type -# define PyUInt16ScalarObject PyUShortScalarObject -# define PyUInt16ArrType_Type PyUShortArrType_Type -#define NPY_INT16_FMT NPY_SHORT_FMT -#define NPY_UINT16_FMT NPY_USHORT_FMT -#endif -#elif NPY_BITSOF_SHORT == 32 -#ifndef NPY_INT32 -#define NPY_INT32 NPY_SHORT -#define NPY_UINT32 NPY_USHORT - typedef short npy_int32; - typedef unsigned short npy_uint32; - typedef unsigned short npy_ucs4; -# define PyInt32ScalarObject PyShortScalarObject -# define PyInt32ArrType_Type PyShortArrType_Type -# define PyUInt32ScalarObject PyUShortScalarObject -# define PyUInt32ArrType_Type PyUShortArrType_Type -#define NPY_INT32_FMT NPY_SHORT_FMT -#define NPY_UINT32_FMT NPY_USHORT_FMT -#endif -#elif NPY_BITSOF_SHORT == 64 -#ifndef NPY_INT64 -#define NPY_INT64 NPY_SHORT -#define NPY_UINT64 NPY_USHORT - typedef short npy_int64; - typedef unsigned short npy_uint64; -# define PyInt64ScalarObject PyShortScalarObject -# define PyInt64ArrType_Type PyShortArrType_Type -# define PyUInt64ScalarObject PyUShortScalarObject -# define PyUInt64ArrType_Type PyUShortArrType_Type -#define NPY_INT64_FMT NPY_SHORT_FMT -#define NPY_UINT64_FMT NPY_USHORT_FMT -# define MyPyLong_FromInt64 PyLong_FromLong -# define MyPyLong_AsInt64 PyLong_AsLong -#endif -#elif NPY_BITSOF_SHORT == 128 -#ifndef NPY_INT128 -#define NPY_INT128 NPY_SHORT -#define NPY_UINT128 NPY_USHORT - typedef short npy_int128; - typedef unsigned short npy_uint128; -# define PyInt128ScalarObject PyShortScalarObject -# define PyInt128ArrType_Type PyShortArrType_Type -# define PyUInt128ScalarObject PyUShortScalarObject -# define PyUInt128ArrType_Type PyUShortArrType_Type -#define NPY_INT128_FMT NPY_SHORT_FMT -#define 
NPY_UINT128_FMT NPY_USHORT_FMT -#endif -#endif - - -#if NPY_BITSOF_CHAR == 8 -#ifndef NPY_INT8 -#define NPY_INT8 NPY_BYTE -#define NPY_UINT8 NPY_UBYTE - typedef signed char npy_int8; - typedef unsigned char npy_uint8; -# define PyInt8ScalarObject PyByteScalarObject -# define PyInt8ArrType_Type PyByteArrType_Type -# define PyUInt8ScalarObject PyUByteScalarObject -# define PyUInt8ArrType_Type PyUByteArrType_Type -#define NPY_INT8_FMT NPY_BYTE_FMT -#define NPY_UINT8_FMT NPY_UBYTE_FMT -#endif -#elif NPY_BITSOF_CHAR == 16 -#ifndef NPY_INT16 -#define NPY_INT16 NPY_BYTE -#define NPY_UINT16 NPY_UBYTE - typedef signed char npy_int16; - typedef unsigned char npy_uint16; -# define PyInt16ScalarObject PyByteScalarObject -# define PyInt16ArrType_Type PyByteArrType_Type -# define PyUInt16ScalarObject PyUByteScalarObject -# define PyUInt16ArrType_Type PyUByteArrType_Type -#define NPY_INT16_FMT NPY_BYTE_FMT -#define NPY_UINT16_FMT NPY_UBYTE_FMT -#endif -#elif NPY_BITSOF_CHAR == 32 -#ifndef NPY_INT32 -#define NPY_INT32 NPY_BYTE -#define NPY_UINT32 NPY_UBYTE - typedef signed char npy_int32; - typedef unsigned char npy_uint32; - typedef unsigned char npy_ucs4; -# define PyInt32ScalarObject PyByteScalarObject -# define PyInt32ArrType_Type PyByteArrType_Type -# define PyUInt32ScalarObject PyUByteScalarObject -# define PyUInt32ArrType_Type PyUByteArrType_Type -#define NPY_INT32_FMT NPY_BYTE_FMT -#define NPY_UINT32_FMT NPY_UBYTE_FMT -#endif -#elif NPY_BITSOF_CHAR == 64 -#ifndef NPY_INT64 -#define NPY_INT64 NPY_BYTE -#define NPY_UINT64 NPY_UBYTE - typedef signed char npy_int64; - typedef unsigned char npy_uint64; -# define PyInt64ScalarObject PyByteScalarObject -# define PyInt64ArrType_Type PyByteArrType_Type -# define PyUInt64ScalarObject PyUByteScalarObject -# define PyUInt64ArrType_Type PyUByteArrType_Type -#define NPY_INT64_FMT NPY_BYTE_FMT -#define NPY_UINT64_FMT NPY_UBYTE_FMT -# define MyPyLong_FromInt64 PyLong_FromLong -# define MyPyLong_AsInt64 PyLong_AsLong -#endif -#elif NPY_BITSOF_CHAR == 128 -#ifndef NPY_INT128 -#define NPY_INT128 NPY_BYTE -#define NPY_UINT128 NPY_UBYTE - typedef signed char npy_int128; - typedef unsigned char npy_uint128; -# define PyInt128ScalarObject PyByteScalarObject -# define PyInt128ArrType_Type PyByteArrType_Type -# define PyUInt128ScalarObject PyUByteScalarObject -# define PyUInt128ArrType_Type PyUByteArrType_Type -#define NPY_INT128_FMT NPY_BYTE_FMT -#define NPY_UINT128_FMT NPY_UBYTE_FMT -#endif -#endif - - - -#if NPY_BITSOF_DOUBLE == 32 -#ifndef NPY_FLOAT32 -#define NPY_FLOAT32 NPY_DOUBLE -#define NPY_COMPLEX64 NPY_CDOUBLE - typedef double npy_float32; - typedef npy_cdouble npy_complex64; -# define PyFloat32ScalarObject PyDoubleScalarObject -# define PyComplex64ScalarObject PyCDoubleScalarObject -# define PyFloat32ArrType_Type PyDoubleArrType_Type -# define PyComplex64ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT32_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX64_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 64 -#ifndef NPY_FLOAT64 -#define NPY_FLOAT64 NPY_DOUBLE -#define NPY_COMPLEX128 NPY_CDOUBLE - typedef double npy_float64; - typedef npy_cdouble npy_complex128; -# define PyFloat64ScalarObject PyDoubleScalarObject -# define PyComplex128ScalarObject PyCDoubleScalarObject -# define PyFloat64ArrType_Type PyDoubleArrType_Type -# define PyComplex128ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT64_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX128_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 80 -#ifndef NPY_FLOAT80 -#define NPY_FLOAT80 NPY_DOUBLE -#define 
NPY_COMPLEX160 NPY_CDOUBLE - typedef double npy_float80; - typedef npy_cdouble npy_complex160; -# define PyFloat80ScalarObject PyDoubleScalarObject -# define PyComplex160ScalarObject PyCDoubleScalarObject -# define PyFloat80ArrType_Type PyDoubleArrType_Type -# define PyComplex160ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT80_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX160_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 96 -#ifndef NPY_FLOAT96 -#define NPY_FLOAT96 NPY_DOUBLE -#define NPY_COMPLEX192 NPY_CDOUBLE - typedef double npy_float96; - typedef npy_cdouble npy_complex192; -# define PyFloat96ScalarObject PyDoubleScalarObject -# define PyComplex192ScalarObject PyCDoubleScalarObject -# define PyFloat96ArrType_Type PyDoubleArrType_Type -# define PyComplex192ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT96_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX192_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 128 -#ifndef NPY_FLOAT128 -#define NPY_FLOAT128 NPY_DOUBLE -#define NPY_COMPLEX256 NPY_CDOUBLE - typedef double npy_float128; - typedef npy_cdouble npy_complex256; -# define PyFloat128ScalarObject PyDoubleScalarObject -# define PyComplex256ScalarObject PyCDoubleScalarObject -# define PyFloat128ArrType_Type PyDoubleArrType_Type -# define PyComplex256ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT128_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX256_FMT NPY_CDOUBLE_FMT -#endif -#endif - - - -#if NPY_BITSOF_FLOAT == 32 -#ifndef NPY_FLOAT32 -#define NPY_FLOAT32 NPY_FLOAT -#define NPY_COMPLEX64 NPY_CFLOAT - typedef float npy_float32; - typedef npy_cfloat npy_complex64; -# define PyFloat32ScalarObject PyFloatScalarObject -# define PyComplex64ScalarObject PyCFloatScalarObject -# define PyFloat32ArrType_Type PyFloatArrType_Type -# define PyComplex64ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT32_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX64_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 64 -#ifndef NPY_FLOAT64 -#define NPY_FLOAT64 NPY_FLOAT -#define NPY_COMPLEX128 NPY_CFLOAT - typedef float npy_float64; - typedef npy_cfloat npy_complex128; -# define PyFloat64ScalarObject PyFloatScalarObject -# define PyComplex128ScalarObject PyCFloatScalarObject -# define PyFloat64ArrType_Type PyFloatArrType_Type -# define PyComplex128ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT64_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX128_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 80 -#ifndef NPY_FLOAT80 -#define NPY_FLOAT80 NPY_FLOAT -#define NPY_COMPLEX160 NPY_CFLOAT - typedef float npy_float80; - typedef npy_cfloat npy_complex160; -# define PyFloat80ScalarObject PyFloatScalarObject -# define PyComplex160ScalarObject PyCFloatScalarObject -# define PyFloat80ArrType_Type PyFloatArrType_Type -# define PyComplex160ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT80_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX160_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 96 -#ifndef NPY_FLOAT96 -#define NPY_FLOAT96 NPY_FLOAT -#define NPY_COMPLEX192 NPY_CFLOAT - typedef float npy_float96; - typedef npy_cfloat npy_complex192; -# define PyFloat96ScalarObject PyFloatScalarObject -# define PyComplex192ScalarObject PyCFloatScalarObject -# define PyFloat96ArrType_Type PyFloatArrType_Type -# define PyComplex192ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT96_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX192_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 128 -#ifndef NPY_FLOAT128 -#define NPY_FLOAT128 NPY_FLOAT -#define NPY_COMPLEX256 NPY_CFLOAT - typedef float npy_float128; - typedef npy_cfloat 
npy_complex256; -# define PyFloat128ScalarObject PyFloatScalarObject -# define PyComplex256ScalarObject PyCFloatScalarObject -# define PyFloat128ArrType_Type PyFloatArrType_Type -# define PyComplex256ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT128_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX256_FMT NPY_CFLOAT_FMT -#endif -#endif - -/* half/float16 isn't a floating-point type in C */ -#define NPY_FLOAT16 NPY_HALF -typedef npy_uint16 npy_half; -typedef npy_half npy_float16; - -#if NPY_BITSOF_LONGDOUBLE == 32 -#ifndef NPY_FLOAT32 -#define NPY_FLOAT32 NPY_LONGDOUBLE -#define NPY_COMPLEX64 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float32; - typedef npy_clongdouble npy_complex64; -# define PyFloat32ScalarObject PyLongDoubleScalarObject -# define PyComplex64ScalarObject PyCLongDoubleScalarObject -# define PyFloat32ArrType_Type PyLongDoubleArrType_Type -# define PyComplex64ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT32_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX64_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 64 -#ifndef NPY_FLOAT64 -#define NPY_FLOAT64 NPY_LONGDOUBLE -#define NPY_COMPLEX128 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float64; - typedef npy_clongdouble npy_complex128; -# define PyFloat64ScalarObject PyLongDoubleScalarObject -# define PyComplex128ScalarObject PyCLongDoubleScalarObject -# define PyFloat64ArrType_Type PyLongDoubleArrType_Type -# define PyComplex128ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT64_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX128_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 80 -#ifndef NPY_FLOAT80 -#define NPY_FLOAT80 NPY_LONGDOUBLE -#define NPY_COMPLEX160 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float80; - typedef npy_clongdouble npy_complex160; -# define PyFloat80ScalarObject PyLongDoubleScalarObject -# define PyComplex160ScalarObject PyCLongDoubleScalarObject -# define PyFloat80ArrType_Type PyLongDoubleArrType_Type -# define PyComplex160ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT80_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX160_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 96 -#ifndef NPY_FLOAT96 -#define NPY_FLOAT96 NPY_LONGDOUBLE -#define NPY_COMPLEX192 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float96; - typedef npy_clongdouble npy_complex192; -# define PyFloat96ScalarObject PyLongDoubleScalarObject -# define PyComplex192ScalarObject PyCLongDoubleScalarObject -# define PyFloat96ArrType_Type PyLongDoubleArrType_Type -# define PyComplex192ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT96_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX192_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 128 -#ifndef NPY_FLOAT128 -#define NPY_FLOAT128 NPY_LONGDOUBLE -#define NPY_COMPLEX256 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float128; - typedef npy_clongdouble npy_complex256; -# define PyFloat128ScalarObject PyLongDoubleScalarObject -# define PyComplex256ScalarObject PyCLongDoubleScalarObject -# define PyFloat128ArrType_Type PyLongDoubleArrType_Type -# define PyComplex256ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT128_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX256_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 256 -#define NPY_FLOAT256 NPY_LONGDOUBLE -#define NPY_COMPLEX512 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float256; - typedef npy_clongdouble npy_complex512; -# define PyFloat256ScalarObject PyLongDoubleScalarObject -# define PyComplex512ScalarObject PyCLongDoubleScalarObject -# define 
PyFloat256ArrType_Type PyLongDoubleArrType_Type -# define PyComplex512ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT256_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX512_FMT NPY_CLONGDOUBLE_FMT -#endif - -/* datetime typedefs */ -typedef npy_int64 npy_timedelta; -typedef npy_int64 npy_datetime; -#define NPY_DATETIME_FMT NPY_INT64_FMT -#define NPY_TIMEDELTA_FMT NPY_INT64_FMT - -/* End of typedefs for numarray style bit-width names */ - -#endif - diff --git a/include/numpy/npy_cpu.h b/include/numpy/npy_cpu.h deleted file mode 100644 index 9707a7adf..000000000 --- a/include/numpy/npy_cpu.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * This set (target) cpu specific macros: - * - Possible values: - * NPY_CPU_X86 - * NPY_CPU_AMD64 - * NPY_CPU_PPC - * NPY_CPU_PPC64 - * NPY_CPU_SPARC - * NPY_CPU_S390 - * NPY_CPU_IA64 - * NPY_CPU_HPPA - * NPY_CPU_ALPHA - * NPY_CPU_ARMEL - * NPY_CPU_ARMEB - * NPY_CPU_SH_LE - * NPY_CPU_SH_BE - */ -#ifndef _NPY_CPUARCH_H_ -#define _NPY_CPUARCH_H_ - -#include "numpyconfig.h" - -#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) - /* - * __i386__ is defined by gcc and Intel compiler on Linux, - * _M_IX86 by VS compiler, - * i386 by Sun compilers on opensolaris at least - */ - #define NPY_CPU_X86 -#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) - /* - * both __x86_64__ and __amd64__ are defined by gcc - * __x86_64 defined by sun compiler on opensolaris at least - * _M_AMD64 defined by MS compiler - */ - #define NPY_CPU_AMD64 -#elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC) - /* - * __ppc__ is defined by gcc, I remember having seen __powerpc__ once, - * but can't find it ATM - * _ARCH_PPC is used by at least gcc on AIX - */ - #define NPY_CPU_PPC -#elif defined(__ppc64__) - #define NPY_CPU_PPC64 -#elif defined(__sparc__) || defined(__sparc) - /* __sparc__ is defined by gcc and Forte (e.g. Sun) compilers */ - #define NPY_CPU_SPARC -#elif defined(__s390__) - #define NPY_CPU_S390 -#elif defined(__ia64) - #define NPY_CPU_IA64 -#elif defined(__hppa) - #define NPY_CPU_HPPA -#elif defined(__alpha__) - #define NPY_CPU_ALPHA -#elif defined(__arm__) && defined(__ARMEL__) - #define NPY_CPU_ARMEL -#elif defined(__arm__) && defined(__ARMEB__) - #define NPY_CPU_ARMEB -#elif defined(__sh__) && defined(__LITTLE_ENDIAN__) - #define NPY_CPU_SH_LE -#elif defined(__sh__) && defined(__BIG_ENDIAN__) - #define NPY_CPU_SH_BE -#elif defined(__MIPSEL__) - #define NPY_CPU_MIPSEL -#elif defined(__MIPSEB__) - #define NPY_CPU_MIPSEB -#elif defined(__aarch64__) - #define NPY_CPU_AARCH64 -#else - #error Unknown CPU, please report this to numpy maintainers with \ - information about your platform (OS, CPU and compiler) -#endif - -/* - This "white-lists" the architectures that we know don't require - pointer alignment. We white-list, since the memcpy version will - work everywhere, whereas assignment will only work where pointer - dereferencing doesn't require alignment. - - TODO: There may be more architectures we can white list. 
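/*
 * Sketch: exactly one NPY_CPU_* macro from npy_cpu.h is defined for the
 * target, so an extension can choose a specialised path at compile time, much
 * like the alignment white-list discussed above. HAVE_FAST_UNALIGNED_ACCESS is
 * an invented name for illustration.
 */
#include "numpy/npy_cpu.h"

#if defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
#define HAVE_FAST_UNALIGNED_ACCESS 1   /* x86 family tolerates unaligned loads */
#else
#define HAVE_FAST_UNALIGNED_ACCESS 0
#endif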
-*/ -#if defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - #define NPY_COPY_PYOBJECT_PTR(dst, src) (*((PyObject **)(dst)) = *((PyObject **)(src))) -#else - #if NPY_SIZEOF_PY_INTPTR_T == 4 - #define NPY_COPY_PYOBJECT_PTR(dst, src) \ - ((char*)(dst))[0] = ((char*)(src))[0]; \ - ((char*)(dst))[1] = ((char*)(src))[1]; \ - ((char*)(dst))[2] = ((char*)(src))[2]; \ - ((char*)(dst))[3] = ((char*)(src))[3]; - #elif NPY_SIZEOF_PY_INTPTR_T == 8 - #define NPY_COPY_PYOBJECT_PTR(dst, src) \ - ((char*)(dst))[0] = ((char*)(src))[0]; \ - ((char*)(dst))[1] = ((char*)(src))[1]; \ - ((char*)(dst))[2] = ((char*)(src))[2]; \ - ((char*)(dst))[3] = ((char*)(src))[3]; \ - ((char*)(dst))[4] = ((char*)(src))[4]; \ - ((char*)(dst))[5] = ((char*)(src))[5]; \ - ((char*)(dst))[6] = ((char*)(src))[6]; \ - ((char*)(dst))[7] = ((char*)(src))[7]; - #else - #error Unknown architecture, please report this to numpy maintainers with \ - information about your platform (OS, CPU and compiler) - #endif -#endif - -#endif diff --git a/include/numpy/npy_deprecated_api.h b/include/numpy/npy_deprecated_api.h deleted file mode 100644 index c27b4a4c9..000000000 --- a/include/numpy/npy_deprecated_api.h +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef _NPY_DEPRECATED_API_H -#define _NPY_DEPRECATED_API_H - -#if defined(_WIN32) -#define _WARN___STR2__(x) #x -#define _WARN___STR1__(x) _WARN___STR2__(x) -#define _WARN___LOC__ __FILE__ "(" _WARN___STR1__(__LINE__) ") : Warning Msg: " -#pragma message(_WARN___LOC__"Using deprecated NumPy API, disable it by " \ - "#defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION") -#elif defined(__GNUC__) -#warning "Using deprecated NumPy API, disable it by #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" -#endif -/* TODO: How to do this warning message for other compilers? */ - -/* - * This header exists to collect all dangerous/deprecated NumPy API. - * - * This is an attempt to remove bad API, the proliferation of macros, - * and namespace pollution currently produced by the NumPy headers. - */ - -#if defined(NPY_NO_DEPRECATED_API) -#error Should never include npy_deprecated_api directly. -#endif - -/* These array flags are deprecated as of NumPy 1.7 */ -#define NPY_CONTIGUOUS NPY_ARRAY_C_CONTIGUOUS -#define NPY_FORTRAN NPY_ARRAY_F_CONTIGUOUS - -/* - * The consistent NPY_ARRAY_* names which don't pollute the NPY_* - * namespace were added in NumPy 1.7. - * - * These versions of the carray flags are deprecated, but - * probably should only be removed after two releases instead of one. 
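/*
 * Sketch: using NPY_COPY_PYOBJECT_PTR (defined above in npy_cpu.h) to move
 * PyObject * slots between raw array buffers. On strict-alignment CPUs it
 * expands to a byte-wise copy; reference counting remains the caller's
 * responsibility. The function name and stride handling are illustrative only.
 */
static void
copy_object_column(char *dst, char *src, npy_intp count, npy_intp stride)
{
    npy_intp i;
    for (i = 0; i < count; i++) {
        NPY_COPY_PYOBJECT_PTR(dst, src);
        dst += stride;
        src += stride;
    }
}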
- */ -#define NPY_C_CONTIGUOUS NPY_ARRAY_C_CONTIGUOUS -#define NPY_F_CONTIGUOUS NPY_ARRAY_F_CONTIGUOUS -#define NPY_OWNDATA NPY_ARRAY_OWNDATA -#define NPY_FORCECAST NPY_ARRAY_FORCECAST -#define NPY_ENSURECOPY NPY_ARRAY_ENSURECOPY -#define NPY_ENSUREARRAY NPY_ARRAY_ENSUREARRAY -#define NPY_ELEMENTSTRIDES NPY_ARRAY_ELEMENTSTRIDES -#define NPY_ALIGNED NPY_ARRAY_ALIGNED -#define NPY_NOTSWAPPED NPY_ARRAY_NOTSWAPPED -#define NPY_WRITEABLE NPY_ARRAY_WRITEABLE -#define NPY_UPDATEIFCOPY NPY_ARRAY_UPDATEIFCOPY -#define NPY_BEHAVED NPY_ARRAY_BEHAVED -#define NPY_BEHAVED_NS NPY_ARRAY_BEHAVED_NS -#define NPY_CARRAY NPY_ARRAY_CARRAY -#define NPY_CARRAY_RO NPY_ARRAY_CARRAY_RO -#define NPY_FARRAY NPY_ARRAY_FARRAY -#define NPY_FARRAY_RO NPY_ARRAY_FARRAY_RO -#define NPY_DEFAULT NPY_ARRAY_DEFAULT -#define NPY_IN_ARRAY NPY_ARRAY_IN_ARRAY -#define NPY_OUT_ARRAY NPY_ARRAY_OUT_ARRAY -#define NPY_INOUT_ARRAY NPY_ARRAY_INOUT_ARRAY -#define NPY_IN_FARRAY NPY_ARRAY_IN_FARRAY -#define NPY_OUT_FARRAY NPY_ARRAY_OUT_FARRAY -#define NPY_INOUT_FARRAY NPY_ARRAY_INOUT_FARRAY -#define NPY_UPDATE_ALL NPY_ARRAY_UPDATE_ALL - -/* This way of accessing the default type is deprecated as of NumPy 1.7 */ -#define PyArray_DEFAULT NPY_DEFAULT_TYPE - -/* These DATETIME bits aren't used internally */ -#if PY_VERSION_HEX >= 0x03000000 -#define PyDataType_GetDatetimeMetaData(descr) \ - ((descr->metadata == NULL) ? NULL : \ - ((PyArray_DatetimeMetaData *)(PyCapsule_GetPointer( \ - PyDict_GetItemString( \ - descr->metadata, NPY_METADATA_DTSTR), NULL)))) -#else -#define PyDataType_GetDatetimeMetaData(descr) \ - ((descr->metadata == NULL) ? NULL : \ - ((PyArray_DatetimeMetaData *)(PyCObject_AsVoidPtr( \ - PyDict_GetItemString(descr->metadata, NPY_METADATA_DTSTR))))) -#endif - -/* - * Deprecated as of NumPy 1.7, this kind of shortcut doesn't - * belong in the public API. - */ -#define NPY_AO PyArrayObject - -/* - * Deprecated as of NumPy 1.7, an all-lowercase macro doesn't - * belong in the public API. - */ -#define fortran fortran_ - -/* - * Deprecated as of NumPy 1.7, as it is a namespace-polluting - * macro. - */ -#define FORTRAN_IF PyArray_FORTRAN_IF - -/* Deprecated as of NumPy 1.7, datetime64 uses c_metadata instead */ -#define NPY_METADATA_DTSTR "__timeunit__" - -/* - * Deprecated as of NumPy 1.7. - * The reasoning: - * - These are for datetime, but there's no datetime "namespace". - * - They just turn NPY_STR_ into "", which is just - * making something simple be indirected. - */ -#define NPY_STR_Y "Y" -#define NPY_STR_M "M" -#define NPY_STR_W "W" -#define NPY_STR_D "D" -#define NPY_STR_h "h" -#define NPY_STR_m "m" -#define NPY_STR_s "s" -#define NPY_STR_ms "ms" -#define NPY_STR_us "us" -#define NPY_STR_ns "ns" -#define NPY_STR_ps "ps" -#define NPY_STR_fs "fs" -#define NPY_STR_as "as" - -/* - * The macros in old_defines.h are Deprecated as of NumPy 1.7 and will be - * removed in the next major release. 
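/*
 * Sketch: the replacement spellings for the deprecated aliases listed above.
 * Defining NPY_NO_DEPRECATED_API hides the old names entirely. The function is
 * a hypothetical consumer and assumes import_array() has been called.
 */
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

static PyObject *
as_contiguous_input(PyObject *obj)
{
    /* NPY_ARRAY_IN_ARRAY is the non-deprecated form of NPY_IN_ARRAY */
    return PyArray_FROM_OF(obj, NPY_ARRAY_IN_ARRAY);
}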
- */ -#include "old_defines.h" - - -#endif diff --git a/include/numpy/npy_endian.h b/include/numpy/npy_endian.h deleted file mode 100644 index 4e3349ffe..000000000 --- a/include/numpy/npy_endian.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _NPY_ENDIAN_H_ -#define _NPY_ENDIAN_H_ - -/* - * NPY_BYTE_ORDER is set to the same value as BYTE_ORDER set by glibc in - * endian.h - */ - -#ifdef NPY_HAVE_ENDIAN_H - /* Use endian.h if available */ - #include - - #define NPY_BYTE_ORDER __BYTE_ORDER - #define NPY_LITTLE_ENDIAN __LITTLE_ENDIAN - #define NPY_BIG_ENDIAN __BIG_ENDIAN -#else - /* Set endianness info using target CPU */ - #include "npy_cpu.h" - - #define NPY_LITTLE_ENDIAN 1234 - #define NPY_BIG_ENDIAN 4321 - - #if defined(NPY_CPU_X86) \ - || defined(NPY_CPU_AMD64) \ - || defined(NPY_CPU_IA64) \ - || defined(NPY_CPU_ALPHA) \ - || defined(NPY_CPU_ARMEL) \ - || defined(NPY_CPU_AARCH64) \ - || defined(NPY_CPU_SH_LE) \ - || defined(NPY_CPU_MIPSEL) - #define NPY_BYTE_ORDER NPY_LITTLE_ENDIAN - #elif defined(NPY_CPU_PPC) \ - || defined(NPY_CPU_SPARC) \ - || defined(NPY_CPU_S390) \ - || defined(NPY_CPU_HPPA) \ - || defined(NPY_CPU_PPC64) \ - || defined(NPY_CPU_ARMEB) \ - || defined(NPY_CPU_SH_BE) \ - || defined(NPY_CPU_MIPSEB) - #define NPY_BYTE_ORDER NPY_BIG_ENDIAN - #else - #error Unknown CPU: can not set endianness - #endif -#endif - -#endif diff --git a/include/numpy/npy_interrupt.h b/include/numpy/npy_interrupt.h deleted file mode 100644 index f71fd689e..000000000 --- a/include/numpy/npy_interrupt.h +++ /dev/null @@ -1,117 +0,0 @@ - -/* Signal handling: - -This header file defines macros that allow your code to handle -interrupts received during processing. Interrupts that -could reasonably be handled: - -SIGINT, SIGABRT, SIGALRM, SIGSEGV - -****Warning*************** - -Do not allow code that creates temporary memory or increases reference -counts of Python objects to be interrupted unless you handle it -differently. - -************************** - -The mechanism for handling interrupts is conceptually simple: - - - replace the signal handler with our own home-grown version - and store the old one. - - run the code to be interrupted -- if an interrupt occurs - the handler should basically just cause a return to the - calling function for finish work. - - restore the old signal handler - -Of course, every code that allows interrupts must account for -returning via the interrupt and handle clean-up correctly. But, -even still, the simple paradigm is complicated by at least three -factors. - - 1) platform portability (i.e. Microsoft says not to use longjmp - to return from signal handling. They have a __try and __except - extension to C instead but what about mingw?). - - 2) how to handle threads: apparently whether signals are delivered to - every thread of the process or the "invoking" thread is platform - dependent. --- we don't handle threads for now. - - 3) do we need to worry about re-entrance. For now, assume the - code will not call-back into itself. - -Ideas: - - 1) Start by implementing an approach that works on platforms that - can use setjmp and longjmp functionality and does nothing - on other platforms. - - 2) Ignore threads --- i.e. do not mix interrupt handling and threads - - 3) Add a default signal_handler function to the C-API but have the rest - use macros. 
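/*
 * Sketch: a compile-time branch on the byte order that npy_endian.h (above)
 * derives either from endian.h or from the detected NPY_CPU_* macro. The
 * helper name is invented for illustration.
 */
#include "numpy/npy_endian.h"

static int
host_is_little_endian(void)
{
#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
    return 1;
#else
    return 0;
#endif
}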
- - -Simple Interface: - - -In your C-extension: around a block of code you want to be interruptable -with a SIGINT - -NPY_SIGINT_ON -[code] -NPY_SIGINT_OFF - -In order for this to work correctly, the -[code] block must not allocate any memory or alter the reference count of any -Python objects. In other words [code] must be interruptible so that continuation -after NPY_SIGINT_OFF will only be "missing some computations" - -Interrupt handling does not work well with threads. - -*/ - -/* Add signal handling macros - Make the global variable and signal handler part of the C-API -*/ - -#ifndef NPY_INTERRUPT_H -#define NPY_INTERRUPT_H - -#ifndef NPY_NO_SIGNAL - -#include -#include - -#ifndef sigsetjmp - -#define NPY_SIGSETJMP(arg1, arg2) setjmp(arg1) -#define NPY_SIGLONGJMP(arg1, arg2) longjmp(arg1, arg2) -#define NPY_SIGJMP_BUF jmp_buf - -#else - -#define NPY_SIGSETJMP(arg1, arg2) sigsetjmp(arg1, arg2) -#define NPY_SIGLONGJMP(arg1, arg2) siglongjmp(arg1, arg2) -#define NPY_SIGJMP_BUF sigjmp_buf - -#endif - -# define NPY_SIGINT_ON { \ - PyOS_sighandler_t _npy_sig_save; \ - _npy_sig_save = PyOS_setsig(SIGINT, _PyArray_SigintHandler); \ - if (NPY_SIGSETJMP(*((NPY_SIGJMP_BUF *)_PyArray_GetSigintBuf()), \ - 1) == 0) { \ - -# define NPY_SIGINT_OFF } \ - PyOS_setsig(SIGINT, _npy_sig_save); \ - } - -#else /* NPY_NO_SIGNAL */ - -#define NPY_SIGINT_ON -#define NPY_SIGINT_OFF - -#endif /* HAVE_SIGSETJMP */ - -#endif /* NPY_INTERRUPT_H */ diff --git a/include/numpy/npy_math.h b/include/numpy/npy_math.h deleted file mode 100644 index 7ae166e54..000000000 --- a/include/numpy/npy_math.h +++ /dev/null @@ -1,438 +0,0 @@ -#ifndef __NPY_MATH_C99_H_ -#define __NPY_MATH_C99_H_ - -#include -#ifdef __SUNPRO_CC -#include -#endif -#include - -/* - * NAN and INFINITY like macros (same behavior as glibc for NAN, same as C99 - * for INFINITY) - * - * XXX: I should test whether INFINITY and NAN are available on the platform - */ -NPY_INLINE static float __npy_inff(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x7f800000UL}; - return __bint.__f; -} - -NPY_INLINE static float __npy_nanf(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x7fc00000UL}; - return __bint.__f; -} - -NPY_INLINE static float __npy_pzerof(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x00000000UL}; - return __bint.__f; -} - -NPY_INLINE static float __npy_nzerof(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x80000000UL}; - return __bint.__f; -} - -#define NPY_INFINITYF __npy_inff() -#define NPY_NANF __npy_nanf() -#define NPY_PZEROF __npy_pzerof() -#define NPY_NZEROF __npy_nzerof() - -#define NPY_INFINITY ((npy_double)NPY_INFINITYF) -#define NPY_NAN ((npy_double)NPY_NANF) -#define NPY_PZERO ((npy_double)NPY_PZEROF) -#define NPY_NZERO ((npy_double)NPY_NZEROF) - -#define NPY_INFINITYL ((npy_longdouble)NPY_INFINITYF) -#define NPY_NANL ((npy_longdouble)NPY_NANF) -#define NPY_PZEROL ((npy_longdouble)NPY_PZEROF) -#define NPY_NZEROL ((npy_longdouble)NPY_NZEROF) - -/* - * Useful constants - */ -#define NPY_E 2.718281828459045235360287471352662498 /* e */ -#define NPY_LOG2E 1.442695040888963407359924681001892137 /* log_2 e */ -#define NPY_LOG10E 0.434294481903251827651128918916605082 /* log_10 e */ -#define NPY_LOGE2 0.693147180559945309417232121458176568 /* log_e 2 */ -#define NPY_LOGE10 2.302585092994045684017991454684364208 /* log_e 10 */ -#define NPY_PI 3.141592653589793238462643383279502884 /* pi */ -#define NPY_PI_2 1.570796326794896619231321691639751442 /* pi/2 */ -#define NPY_PI_4 
0.785398163397448309615660845819875721 /* pi/4 */ -#define NPY_1_PI 0.318309886183790671537767526745028724 /* 1/pi */ -#define NPY_2_PI 0.636619772367581343075535053490057448 /* 2/pi */ -#define NPY_EULER 0.577215664901532860606512090082402431 /* Euler constant */ -#define NPY_SQRT2 1.414213562373095048801688724209698079 /* sqrt(2) */ -#define NPY_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */ - -#define NPY_Ef 2.718281828459045235360287471352662498F /* e */ -#define NPY_LOG2Ef 1.442695040888963407359924681001892137F /* log_2 e */ -#define NPY_LOG10Ef 0.434294481903251827651128918916605082F /* log_10 e */ -#define NPY_LOGE2f 0.693147180559945309417232121458176568F /* log_e 2 */ -#define NPY_LOGE10f 2.302585092994045684017991454684364208F /* log_e 10 */ -#define NPY_PIf 3.141592653589793238462643383279502884F /* pi */ -#define NPY_PI_2f 1.570796326794896619231321691639751442F /* pi/2 */ -#define NPY_PI_4f 0.785398163397448309615660845819875721F /* pi/4 */ -#define NPY_1_PIf 0.318309886183790671537767526745028724F /* 1/pi */ -#define NPY_2_PIf 0.636619772367581343075535053490057448F /* 2/pi */ -#define NPY_EULERf 0.577215664901532860606512090082402431F /* Euler constan*/ -#define NPY_SQRT2f 1.414213562373095048801688724209698079F /* sqrt(2) */ -#define NPY_SQRT1_2f 0.707106781186547524400844362104849039F /* 1/sqrt(2) */ - -#define NPY_El 2.718281828459045235360287471352662498L /* e */ -#define NPY_LOG2El 1.442695040888963407359924681001892137L /* log_2 e */ -#define NPY_LOG10El 0.434294481903251827651128918916605082L /* log_10 e */ -#define NPY_LOGE2l 0.693147180559945309417232121458176568L /* log_e 2 */ -#define NPY_LOGE10l 2.302585092994045684017991454684364208L /* log_e 10 */ -#define NPY_PIl 3.141592653589793238462643383279502884L /* pi */ -#define NPY_PI_2l 1.570796326794896619231321691639751442L /* pi/2 */ -#define NPY_PI_4l 0.785398163397448309615660845819875721L /* pi/4 */ -#define NPY_1_PIl 0.318309886183790671537767526745028724L /* 1/pi */ -#define NPY_2_PIl 0.636619772367581343075535053490057448L /* 2/pi */ -#define NPY_EULERl 0.577215664901532860606512090082402431L /* Euler constan*/ -#define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */ -#define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */ - -/* - * C99 double math funcs - */ -double npy_sin(double x); -double npy_cos(double x); -double npy_tan(double x); -double npy_sinh(double x); -double npy_cosh(double x); -double npy_tanh(double x); - -double npy_asin(double x); -double npy_acos(double x); -double npy_atan(double x); -double npy_aexp(double x); -double npy_alog(double x); -double npy_asqrt(double x); -double npy_afabs(double x); - -double npy_log(double x); -double npy_log10(double x); -double npy_exp(double x); -double npy_sqrt(double x); - -double npy_fabs(double x); -double npy_ceil(double x); -double npy_fmod(double x, double y); -double npy_floor(double x); - -double npy_expm1(double x); -double npy_log1p(double x); -double npy_hypot(double x, double y); -double npy_acosh(double x); -double npy_asinh(double xx); -double npy_atanh(double x); -double npy_rint(double x); -double npy_trunc(double x); -double npy_exp2(double x); -double npy_log2(double x); - -double npy_atan2(double x, double y); -double npy_pow(double x, double y); -double npy_modf(double x, double* y); - -double npy_copysign(double x, double y); -double npy_nextafter(double x, double y); -double npy_spacing(double x); - -/* - * IEEE 754 fpu handling. 
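/*
 * Sketch: the constant table and npy_* wrappers above give C99 math behaviour
 * even on toolchains whose libm predates C99. Both helper names are invented;
 * numpy/npy_math.h is assumed to be included and libnpymath linked.
 */
static double
stable_magnitude(double x, double y)
{
    return npy_hypot(x, y);               /* avoids overflow in sqrt(x*x + y*y) */
}

static double
to_radians(double degrees)
{
    return degrees * (NPY_PI / 180.0);    /* NPY_PI from the constants above */
}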
Those are guaranteed to be macros - */ -#ifndef NPY_HAVE_DECL_ISNAN - #define npy_isnan(x) ((x) != (x)) -#else - #ifdef _MSC_VER - #define npy_isnan(x) _isnan((x)) - #else - #define npy_isnan(x) isnan((x)) - #endif -#endif - -#ifndef NPY_HAVE_DECL_ISFINITE - #ifdef _MSC_VER - #define npy_isfinite(x) _finite((x)) - #else - #define npy_isfinite(x) !npy_isnan((x) + (-x)) - #endif -#else - #define npy_isfinite(x) isfinite((x)) -#endif - -#ifndef NPY_HAVE_DECL_ISINF - #define npy_isinf(x) (!npy_isfinite(x) && !npy_isnan(x)) -#else - #ifdef _MSC_VER - #define npy_isinf(x) (!_finite((x)) && !_isnan((x))) - #else - #define npy_isinf(x) isinf((x)) - #endif -#endif - -#ifndef NPY_HAVE_DECL_SIGNBIT - int _npy_signbit_f(float x); - int _npy_signbit_d(double x); - int _npy_signbit_ld(long double x); - #define npy_signbit(x) \ - (sizeof (x) == sizeof (long double) ? _npy_signbit_ld (x) \ - : sizeof (x) == sizeof (double) ? _npy_signbit_d (x) \ - : _npy_signbit_f (x)) -#else - #define npy_signbit(x) signbit((x)) -#endif - -/* - * float C99 math functions - */ - -float npy_sinf(float x); -float npy_cosf(float x); -float npy_tanf(float x); -float npy_sinhf(float x); -float npy_coshf(float x); -float npy_tanhf(float x); -float npy_fabsf(float x); -float npy_floorf(float x); -float npy_ceilf(float x); -float npy_rintf(float x); -float npy_truncf(float x); -float npy_sqrtf(float x); -float npy_log10f(float x); -float npy_logf(float x); -float npy_expf(float x); -float npy_expm1f(float x); -float npy_asinf(float x); -float npy_acosf(float x); -float npy_atanf(float x); -float npy_asinhf(float x); -float npy_acoshf(float x); -float npy_atanhf(float x); -float npy_log1pf(float x); -float npy_exp2f(float x); -float npy_log2f(float x); - -float npy_atan2f(float x, float y); -float npy_hypotf(float x, float y); -float npy_powf(float x, float y); -float npy_fmodf(float x, float y); - -float npy_modff(float x, float* y); - -float npy_copysignf(float x, float y); -float npy_nextafterf(float x, float y); -float npy_spacingf(float x); - -/* - * float C99 math functions - */ - -npy_longdouble npy_sinl(npy_longdouble x); -npy_longdouble npy_cosl(npy_longdouble x); -npy_longdouble npy_tanl(npy_longdouble x); -npy_longdouble npy_sinhl(npy_longdouble x); -npy_longdouble npy_coshl(npy_longdouble x); -npy_longdouble npy_tanhl(npy_longdouble x); -npy_longdouble npy_fabsl(npy_longdouble x); -npy_longdouble npy_floorl(npy_longdouble x); -npy_longdouble npy_ceill(npy_longdouble x); -npy_longdouble npy_rintl(npy_longdouble x); -npy_longdouble npy_truncl(npy_longdouble x); -npy_longdouble npy_sqrtl(npy_longdouble x); -npy_longdouble npy_log10l(npy_longdouble x); -npy_longdouble npy_logl(npy_longdouble x); -npy_longdouble npy_expl(npy_longdouble x); -npy_longdouble npy_expm1l(npy_longdouble x); -npy_longdouble npy_asinl(npy_longdouble x); -npy_longdouble npy_acosl(npy_longdouble x); -npy_longdouble npy_atanl(npy_longdouble x); -npy_longdouble npy_asinhl(npy_longdouble x); -npy_longdouble npy_acoshl(npy_longdouble x); -npy_longdouble npy_atanhl(npy_longdouble x); -npy_longdouble npy_log1pl(npy_longdouble x); -npy_longdouble npy_exp2l(npy_longdouble x); -npy_longdouble npy_log2l(npy_longdouble x); - -npy_longdouble npy_atan2l(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_hypotl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_powl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_fmodl(npy_longdouble x, npy_longdouble y); - -npy_longdouble npy_modfl(npy_longdouble x, npy_longdouble* y); - -npy_longdouble 
npy_copysignl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_spacingl(npy_longdouble x); - -/* - * Non standard functions - */ -double npy_deg2rad(double x); -double npy_rad2deg(double x); -double npy_logaddexp(double x, double y); -double npy_logaddexp2(double x, double y); - -float npy_deg2radf(float x); -float npy_rad2degf(float x); -float npy_logaddexpf(float x, float y); -float npy_logaddexp2f(float x, float y); - -npy_longdouble npy_deg2radl(npy_longdouble x); -npy_longdouble npy_rad2degl(npy_longdouble x); -npy_longdouble npy_logaddexpl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_logaddexp2l(npy_longdouble x, npy_longdouble y); - -#define npy_degrees npy_rad2deg -#define npy_degreesf npy_rad2degf -#define npy_degreesl npy_rad2degl - -#define npy_radians npy_deg2rad -#define npy_radiansf npy_deg2radf -#define npy_radiansl npy_deg2radl - -/* - * Complex declarations - */ - -/* - * C99 specifies that complex numbers have the same representation as - * an array of two elements, where the first element is the real part - * and the second element is the imaginary part. - */ -#define __NPY_CPACK_IMP(x, y, type, ctype) \ - union { \ - ctype z; \ - type a[2]; \ - } z1;; \ - \ - z1.a[0] = (x); \ - z1.a[1] = (y); \ - \ - return z1.z; - -static NPY_INLINE npy_cdouble npy_cpack(double x, double y) -{ - __NPY_CPACK_IMP(x, y, double, npy_cdouble); -} - -static NPY_INLINE npy_cfloat npy_cpackf(float x, float y) -{ - __NPY_CPACK_IMP(x, y, float, npy_cfloat); -} - -static NPY_INLINE npy_clongdouble npy_cpackl(npy_longdouble x, npy_longdouble y) -{ - __NPY_CPACK_IMP(x, y, npy_longdouble, npy_clongdouble); -} -#undef __NPY_CPACK_IMP - -/* - * Same remark as above, but in the other direction: extract first/second - * member of complex number, assuming a C99-compatible representation - * - * Those are defineds as static inline, and such as a reasonable compiler would - * most likely compile this to one or two instructions (on CISC at least) - */ -#define __NPY_CEXTRACT_IMP(z, index, type, ctype) \ - union { \ - ctype z; \ - type a[2]; \ - } __z_repr; \ - __z_repr.z = z; \ - \ - return __z_repr.a[index]; - -static NPY_INLINE double npy_creal(npy_cdouble z) -{ - __NPY_CEXTRACT_IMP(z, 0, double, npy_cdouble); -} - -static NPY_INLINE double npy_cimag(npy_cdouble z) -{ - __NPY_CEXTRACT_IMP(z, 1, double, npy_cdouble); -} - -static NPY_INLINE float npy_crealf(npy_cfloat z) -{ - __NPY_CEXTRACT_IMP(z, 0, float, npy_cfloat); -} - -static NPY_INLINE float npy_cimagf(npy_cfloat z) -{ - __NPY_CEXTRACT_IMP(z, 1, float, npy_cfloat); -} - -static NPY_INLINE npy_longdouble npy_creall(npy_clongdouble z) -{ - __NPY_CEXTRACT_IMP(z, 0, npy_longdouble, npy_clongdouble); -} - -static NPY_INLINE npy_longdouble npy_cimagl(npy_clongdouble z) -{ - __NPY_CEXTRACT_IMP(z, 1, npy_longdouble, npy_clongdouble); -} -#undef __NPY_CEXTRACT_IMP - -/* - * Double precision complex functions - */ -double npy_cabs(npy_cdouble z); -double npy_carg(npy_cdouble z); - -npy_cdouble npy_cexp(npy_cdouble z); -npy_cdouble npy_clog(npy_cdouble z); -npy_cdouble npy_cpow(npy_cdouble x, npy_cdouble y); - -npy_cdouble npy_csqrt(npy_cdouble z); - -npy_cdouble npy_ccos(npy_cdouble z); -npy_cdouble npy_csin(npy_cdouble z); - -/* - * Single precision complex functions - */ -float npy_cabsf(npy_cfloat z); -float npy_cargf(npy_cfloat z); - -npy_cfloat npy_cexpf(npy_cfloat z); -npy_cfloat npy_clogf(npy_cfloat z); -npy_cfloat npy_cpowf(npy_cfloat x, npy_cfloat y); - 
-npy_cfloat npy_csqrtf(npy_cfloat z); - -npy_cfloat npy_ccosf(npy_cfloat z); -npy_cfloat npy_csinf(npy_cfloat z); - -/* - * Extended precision complex functions - */ -npy_longdouble npy_cabsl(npy_clongdouble z); -npy_longdouble npy_cargl(npy_clongdouble z); - -npy_clongdouble npy_cexpl(npy_clongdouble z); -npy_clongdouble npy_clogl(npy_clongdouble z); -npy_clongdouble npy_cpowl(npy_clongdouble x, npy_clongdouble y); - -npy_clongdouble npy_csqrtl(npy_clongdouble z); - -npy_clongdouble npy_ccosl(npy_clongdouble z); -npy_clongdouble npy_csinl(npy_clongdouble z); - -/* - * Functions that set the floating point error - * status word. - */ - -void npy_set_floatstatus_divbyzero(void); -void npy_set_floatstatus_overflow(void); -void npy_set_floatstatus_underflow(void); -void npy_set_floatstatus_invalid(void); - -#endif diff --git a/include/numpy/npy_no_deprecated_api.h b/include/numpy/npy_no_deprecated_api.h deleted file mode 100644 index 6183dc278..000000000 --- a/include/numpy/npy_no_deprecated_api.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * This include file is provided for inclusion in Cython *.pyd files where - * one would like to define the NPY_NO_DEPRECATED_API macro. It can be - * included by - * - * cdef extern from "npy_no_deprecated_api.h": pass - * - */ -#ifndef NPY_NO_DEPRECATED_API - -/* put this check here since there may be multiple includes in C extensions. */ -#if defined(NDARRAYTYPES_H) || defined(_NPY_DEPRECATED_API_H) || \ - defined(OLD_DEFINES_H) -#error "npy_no_deprecated_api.h" must be first among numpy includes. -#else -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#endif - -#endif diff --git a/include/numpy/npy_os.h b/include/numpy/npy_os.h deleted file mode 100644 index 9228c3916..000000000 --- a/include/numpy/npy_os.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NPY_OS_H_ -#define _NPY_OS_H_ - -#if defined(linux) || defined(__linux) || defined(__linux__) - #define NPY_OS_LINUX -#elif defined(__FreeBSD__) || defined(__NetBSD__) || \ - defined(__OpenBSD__) || defined(__DragonFly__) - #define NPY_OS_BSD - #ifdef __FreeBSD__ - #define NPY_OS_FREEBSD - #elif defined(__NetBSD__) - #define NPY_OS_NETBSD - #elif defined(__OpenBSD__) - #define NPY_OS_OPENBSD - #elif defined(__DragonFly__) - #define NPY_OS_DRAGONFLY - #endif -#elif defined(sun) || defined(__sun) - #define NPY_OS_SOLARIS -#elif defined(__CYGWIN__) - #define NPY_OS_CYGWIN -#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32) - #define NPY_OS_WIN32 -#elif defined(__APPLE__) - #define NPY_OS_DARWIN -#else - #define NPY_OS_UNKNOWN -#endif - -#endif diff --git a/include/numpy/numpyconfig.h b/include/numpy/numpyconfig.h deleted file mode 100644 index 401d19fd7..000000000 --- a/include/numpy/numpyconfig.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _NPY_NUMPYCONFIG_H_ -#define _NPY_NUMPYCONFIG_H_ - -#include "_numpyconfig.h" - -/* - * On Mac OS X, because there is only one configuration stage for all the archs - * in universal builds, any macro which depends on the arch needs to be - * harcoded - */ -#ifdef __APPLE__ - #undef NPY_SIZEOF_LONG - #undef NPY_SIZEOF_PY_INTPTR_T - - #ifdef __LP64__ - #define NPY_SIZEOF_LONG 8 - #define NPY_SIZEOF_PY_INTPTR_T 8 - #else - #define NPY_SIZEOF_LONG 4 - #define NPY_SIZEOF_PY_INTPTR_T 4 - #endif -#endif - -/** - * To help with the NPY_NO_DEPRECATED_API macro, we include API version - * numbers for specific versions of NumPy. 
To exclude all API that was - * deprecated as of 1.7, add the following before #including any NumPy - * headers: - * #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION - */ -#define NPY_1_7_API_VERSION 0x00000007 - -#endif diff --git a/include/numpy/old_defines.h b/include/numpy/old_defines.h deleted file mode 100644 index abf81595a..000000000 --- a/include/numpy/old_defines.h +++ /dev/null @@ -1,187 +0,0 @@ -/* This header is deprecated as of NumPy 1.7 */ -#ifndef OLD_DEFINES_H -#define OLD_DEFINES_H - -#if defined(NPY_NO_DEPRECATED_API) && NPY_NO_DEPRECATED_API >= NPY_1_7_API_VERSION -#error The header "old_defines.h" is deprecated as of NumPy 1.7. -#endif - -#define NDARRAY_VERSION NPY_VERSION - -#define PyArray_MIN_BUFSIZE NPY_MIN_BUFSIZE -#define PyArray_MAX_BUFSIZE NPY_MAX_BUFSIZE -#define PyArray_BUFSIZE NPY_BUFSIZE - -#define PyArray_PRIORITY NPY_PRIORITY -#define PyArray_SUBTYPE_PRIORITY NPY_PRIORITY -#define PyArray_NUM_FLOATTYPE NPY_NUM_FLOATTYPE - -#define NPY_MAX PyArray_MAX -#define NPY_MIN PyArray_MIN - -#define PyArray_TYPES NPY_TYPES -#define PyArray_BOOL NPY_BOOL -#define PyArray_BYTE NPY_BYTE -#define PyArray_UBYTE NPY_UBYTE -#define PyArray_SHORT NPY_SHORT -#define PyArray_USHORT NPY_USHORT -#define PyArray_INT NPY_INT -#define PyArray_UINT NPY_UINT -#define PyArray_LONG NPY_LONG -#define PyArray_ULONG NPY_ULONG -#define PyArray_LONGLONG NPY_LONGLONG -#define PyArray_ULONGLONG NPY_ULONGLONG -#define PyArray_HALF NPY_HALF -#define PyArray_FLOAT NPY_FLOAT -#define PyArray_DOUBLE NPY_DOUBLE -#define PyArray_LONGDOUBLE NPY_LONGDOUBLE -#define PyArray_CFLOAT NPY_CFLOAT -#define PyArray_CDOUBLE NPY_CDOUBLE -#define PyArray_CLONGDOUBLE NPY_CLONGDOUBLE -#define PyArray_OBJECT NPY_OBJECT -#define PyArray_STRING NPY_STRING -#define PyArray_UNICODE NPY_UNICODE -#define PyArray_VOID NPY_VOID -#define PyArray_DATETIME NPY_DATETIME -#define PyArray_TIMEDELTA NPY_TIMEDELTA -#define PyArray_NTYPES NPY_NTYPES -#define PyArray_NOTYPE NPY_NOTYPE -#define PyArray_CHAR NPY_CHAR -#define PyArray_USERDEF NPY_USERDEF -#define PyArray_NUMUSERTYPES NPY_NUMUSERTYPES - -#define PyArray_INTP NPY_INTP -#define PyArray_UINTP NPY_UINTP - -#define PyArray_INT8 NPY_INT8 -#define PyArray_UINT8 NPY_UINT8 -#define PyArray_INT16 NPY_INT16 -#define PyArray_UINT16 NPY_UINT16 -#define PyArray_INT32 NPY_INT32 -#define PyArray_UINT32 NPY_UINT32 - -#ifdef NPY_INT64 -#define PyArray_INT64 NPY_INT64 -#define PyArray_UINT64 NPY_UINT64 -#endif - -#ifdef NPY_INT128 -#define PyArray_INT128 NPY_INT128 -#define PyArray_UINT128 NPY_UINT128 -#endif - -#ifdef NPY_FLOAT16 -#define PyArray_FLOAT16 NPY_FLOAT16 -#define PyArray_COMPLEX32 NPY_COMPLEX32 -#endif - -#ifdef NPY_FLOAT80 -#define PyArray_FLOAT80 NPY_FLOAT80 -#define PyArray_COMPLEX160 NPY_COMPLEX160 -#endif - -#ifdef NPY_FLOAT96 -#define PyArray_FLOAT96 NPY_FLOAT96 -#define PyArray_COMPLEX192 NPY_COMPLEX192 -#endif - -#ifdef NPY_FLOAT128 -#define PyArray_FLOAT128 NPY_FLOAT128 -#define PyArray_COMPLEX256 NPY_COMPLEX256 -#endif - -#define PyArray_FLOAT32 NPY_FLOAT32 -#define PyArray_COMPLEX64 NPY_COMPLEX64 -#define PyArray_FLOAT64 NPY_FLOAT64 -#define PyArray_COMPLEX128 NPY_COMPLEX128 - - -#define PyArray_TYPECHAR NPY_TYPECHAR -#define PyArray_BOOLLTR NPY_BOOLLTR -#define PyArray_BYTELTR NPY_BYTELTR -#define PyArray_UBYTELTR NPY_UBYTELTR -#define PyArray_SHORTLTR NPY_SHORTLTR -#define PyArray_USHORTLTR NPY_USHORTLTR -#define PyArray_INTLTR NPY_INTLTR -#define PyArray_UINTLTR NPY_UINTLTR -#define PyArray_LONGLTR NPY_LONGLTR -#define PyArray_ULONGLTR NPY_ULONGLTR -#define 
PyArray_LONGLONGLTR NPY_LONGLONGLTR -#define PyArray_ULONGLONGLTR NPY_ULONGLONGLTR -#define PyArray_HALFLTR NPY_HALFLTR -#define PyArray_FLOATLTR NPY_FLOATLTR -#define PyArray_DOUBLELTR NPY_DOUBLELTR -#define PyArray_LONGDOUBLELTR NPY_LONGDOUBLELTR -#define PyArray_CFLOATLTR NPY_CFLOATLTR -#define PyArray_CDOUBLELTR NPY_CDOUBLELTR -#define PyArray_CLONGDOUBLELTR NPY_CLONGDOUBLELTR -#define PyArray_OBJECTLTR NPY_OBJECTLTR -#define PyArray_STRINGLTR NPY_STRINGLTR -#define PyArray_STRINGLTR2 NPY_STRINGLTR2 -#define PyArray_UNICODELTR NPY_UNICODELTR -#define PyArray_VOIDLTR NPY_VOIDLTR -#define PyArray_DATETIMELTR NPY_DATETIMELTR -#define PyArray_TIMEDELTALTR NPY_TIMEDELTALTR -#define PyArray_CHARLTR NPY_CHARLTR -#define PyArray_INTPLTR NPY_INTPLTR -#define PyArray_UINTPLTR NPY_UINTPLTR -#define PyArray_GENBOOLLTR NPY_GENBOOLLTR -#define PyArray_SIGNEDLTR NPY_SIGNEDLTR -#define PyArray_UNSIGNEDLTR NPY_UNSIGNEDLTR -#define PyArray_FLOATINGLTR NPY_FLOATINGLTR -#define PyArray_COMPLEXLTR NPY_COMPLEXLTR - -#define PyArray_QUICKSORT NPY_QUICKSORT -#define PyArray_HEAPSORT NPY_HEAPSORT -#define PyArray_MERGESORT NPY_MERGESORT -#define PyArray_SORTKIND NPY_SORTKIND -#define PyArray_NSORTS NPY_NSORTS - -#define PyArray_NOSCALAR NPY_NOSCALAR -#define PyArray_BOOL_SCALAR NPY_BOOL_SCALAR -#define PyArray_INTPOS_SCALAR NPY_INTPOS_SCALAR -#define PyArray_INTNEG_SCALAR NPY_INTNEG_SCALAR -#define PyArray_FLOAT_SCALAR NPY_FLOAT_SCALAR -#define PyArray_COMPLEX_SCALAR NPY_COMPLEX_SCALAR -#define PyArray_OBJECT_SCALAR NPY_OBJECT_SCALAR -#define PyArray_SCALARKIND NPY_SCALARKIND -#define PyArray_NSCALARKINDS NPY_NSCALARKINDS - -#define PyArray_ANYORDER NPY_ANYORDER -#define PyArray_CORDER NPY_CORDER -#define PyArray_FORTRANORDER NPY_FORTRANORDER -#define PyArray_ORDER NPY_ORDER - -#define PyDescr_ISBOOL PyDataType_ISBOOL -#define PyDescr_ISUNSIGNED PyDataType_ISUNSIGNED -#define PyDescr_ISSIGNED PyDataType_ISSIGNED -#define PyDescr_ISINTEGER PyDataType_ISINTEGER -#define PyDescr_ISFLOAT PyDataType_ISFLOAT -#define PyDescr_ISNUMBER PyDataType_ISNUMBER -#define PyDescr_ISSTRING PyDataType_ISSTRING -#define PyDescr_ISCOMPLEX PyDataType_ISCOMPLEX -#define PyDescr_ISPYTHON PyDataType_ISPYTHON -#define PyDescr_ISFLEXIBLE PyDataType_ISFLEXIBLE -#define PyDescr_ISUSERDEF PyDataType_ISUSERDEF -#define PyDescr_ISEXTENDED PyDataType_ISEXTENDED -#define PyDescr_ISOBJECT PyDataType_ISOBJECT -#define PyDescr_HASFIELDS PyDataType_HASFIELDS - -#define PyArray_LITTLE NPY_LITTLE -#define PyArray_BIG NPY_BIG -#define PyArray_NATIVE NPY_NATIVE -#define PyArray_SWAP NPY_SWAP -#define PyArray_IGNORE NPY_IGNORE - -#define PyArray_NATBYTE NPY_NATBYTE -#define PyArray_OPPBYTE NPY_OPPBYTE - -#define PyArray_MAX_ELSIZE NPY_MAX_ELSIZE - -#define PyArray_USE_PYMEM NPY_USE_PYMEM - -#define PyArray_RemoveLargest PyArray_RemoveSmallest - -#define PyArray_UCS4 npy_ucs4 - -#endif diff --git a/include/numpy/oldnumeric.h b/include/numpy/oldnumeric.h deleted file mode 100644 index 748f06da3..000000000 --- a/include/numpy/oldnumeric.h +++ /dev/null @@ -1,23 +0,0 @@ -#include "arrayobject.h" - -#ifndef REFCOUNT -# define REFCOUNT NPY_REFCOUNT -# define MAX_ELSIZE 16 -#endif - -#define PyArray_UNSIGNED_TYPES -#define PyArray_SBYTE NPY_BYTE -#define PyArray_CopyArray PyArray_CopyInto -#define _PyArray_multiply_list PyArray_MultiplyIntList -#define PyArray_ISSPACESAVER(m) NPY_FALSE -#define PyScalarArray_Check PyArray_CheckScalar - -#define CONTIGUOUS NPY_CONTIGUOUS -#define OWN_DIMENSIONS 0 -#define OWN_STRIDES 0 -#define OWN_DATA NPY_OWNDATA -#define 
SAVESPACE 0 -#define SAVESPACEBIT 0 - -#undef import_array -#define import_array() { if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); } } diff --git a/include/numpy/ufunc_api.txt b/include/numpy/ufunc_api.txt deleted file mode 100644 index 3365433cd..000000000 --- a/include/numpy/ufunc_api.txt +++ /dev/null @@ -1,312 +0,0 @@ - -================= -Numpy Ufunc C-API -================= -:: - - PyObject * - PyUFunc_FromFuncAndData(PyUFuncGenericFunction *func, void - **data, char *types, int ntypes, int nin, int - nout, int identity, char *name, char *doc, int - check_return) - - -:: - - int - PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, int - usertype, PyUFuncGenericFunction - function, int *arg_types, void *data) - - -:: - - int - PyUFunc_GenericFunction(PyUFuncObject *ufunc, PyObject *args, PyObject - *kwds, PyArrayObject **op) - - -This generic function is called with the ufunc object, the arguments to it, -and an array of (pointers to) PyArrayObjects which are NULL. - -'op' is an array of at least NPY_MAXARGS PyArrayObject *. - -:: - - void - PyUFunc_f_f_As_d_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_f_f(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_g_g(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_F_F_As_D_D(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_F_F(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_D_D(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_G_G(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_O_O(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_ff_f_As_dd_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_ff_f(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_gg_g(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_FF_F_As_DD_D(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_DD_D(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_FF_F(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_GG_G(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_OO_O(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_O_O_method(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_OO_O_method(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_On_Om(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - int - PyUFunc_GetPyValues(char *name, int *bufsize, int *errmask, PyObject - **errobj) - - -On return, if errobj is populated with a non-NULL value, the caller -owns a new reference to errobj. 
- -:: - - int - PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first) - - -:: - - void - PyUFunc_clearfperr() - - -:: - - int - PyUFunc_getfperr(void ) - - -:: - - int - PyUFunc_handlefperr(int errmask, PyObject *errobj, int retstatus, int - *first) - - -:: - - int - PyUFunc_ReplaceLoopBySignature(PyUFuncObject - *func, PyUFuncGenericFunction - newfunc, int - *signature, PyUFuncGenericFunction - *oldfunc) - - -:: - - PyObject * - PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void - **data, char *types, int - ntypes, int nin, int nout, int - identity, char *name, char - *doc, int check_return, const char - *signature) - - -:: - - int - PyUFunc_SetUsesArraysAsData(void **data, size_t i) - - -:: - - void - PyUFunc_e_e(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_e_e_As_f_f(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_e_e_As_d_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_ee_e(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_ee_e_As_ff_f(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_ee_e_As_dd_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - int - PyUFunc_DefaultTypeResolver(PyUFuncObject *ufunc, NPY_CASTING - casting, PyArrayObject - **operands, PyObject - *type_tup, PyArray_Descr **out_dtypes) - - -This function applies the default type resolution rules -for the provided ufunc. - -Returns 0 on success, -1 on error. - -:: - - int - PyUFunc_ValidateCasting(PyUFuncObject *ufunc, NPY_CASTING - casting, PyArrayObject - **operands, PyArray_Descr **dtypes) - - -Validates that the input operands can be cast to -the input types, and the output types can be cast to -the output operands where provided. - -Returns 0 on success, -1 (with exception raised) on validation failure. - diff --git a/include/numpy/ufuncobject.h b/include/numpy/ufuncobject.h deleted file mode 100644 index 95afd5aa2..000000000 --- a/include/numpy/ufuncobject.h +++ /dev/null @@ -1,446 +0,0 @@ -#ifndef Py_UFUNCOBJECT_H -#define Py_UFUNCOBJECT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The legacy generic inner loop for a standard element-wise or - * generalized ufunc. - */ -typedef void (*PyUFuncGenericFunction) - (char **args, - npy_intp *dimensions, - npy_intp *strides, - void *innerloopdata); - -/* - * The most generic one-dimensional inner loop for - * a standard element-wise ufunc. This typedef is also - * more consistent with the other NumPy function pointer typedefs - * than PyUFuncGenericFunction. - */ -typedef void (PyUFunc_StridedInnerLoopFunc)( - char **dataptrs, npy_intp *strides, - npy_intp count, - NpyAuxData *innerloopdata); - -/* - * The most generic one-dimensional inner loop for - * a masked standard element-wise ufunc. "Masked" here means that it skips - * doing calculations on any items for which the maskptr array has a true - * value. - */ -typedef void (PyUFunc_MaskedStridedInnerLoopFunc)( - char **dataptrs, npy_intp *strides, - char *maskptr, npy_intp mask_stride, - npy_intp count, - NpyAuxData *innerloopdata); - -/* Forward declaration for the type resolver and loop selector typedefs */ -struct _tagPyUFuncObject; - -/* - * Given the operands for calling a ufunc, should determine the - * calculation input and output data types and return an inner loop function. 
- * This function should validate that the casting rule is being followed, - * and fail if it is not. - * - * For backwards compatibility, the regular type resolution function does not - * support auxiliary data with object semantics. The type resolution call - * which returns a masked generic function returns a standard NpyAuxData - * object, for which the NPY_AUXDATA_FREE and NPY_AUXDATA_CLONE macros - * work. - * - * ufunc: The ufunc object. - * casting: The 'casting' parameter provided to the ufunc. - * operands: An array of length (ufunc->nin + ufunc->nout), - * with the output parameters possibly NULL. - * type_tup: Either NULL, or the type_tup passed to the ufunc. - * out_dtypes: An array which should be populated with new - * references to (ufunc->nin + ufunc->nout) new - * dtypes, one for each input and output. These - * dtypes should all be in native-endian format. - * - * Should return 0 on success, -1 on failure (with exception set), - * or -2 if Py_NotImplemented should be returned. - */ -typedef int (PyUFunc_TypeResolutionFunc)( - struct _tagPyUFuncObject *ufunc, - NPY_CASTING casting, - PyArrayObject **operands, - PyObject *type_tup, - PyArray_Descr **out_dtypes); - -/* - * Given an array of DTypes as returned by the PyUFunc_TypeResolutionFunc, - * and an array of fixed strides (the array will contain NPY_MAX_INTP for - * strides which are not necessarily fixed), returns an inner loop - * with associated auxiliary data. - * - * For backwards compatibility, there is a variant of the inner loop - * selection which returns an inner loop irrespective of the strides, - * and with a void* static auxiliary data instead of an NpyAuxData * - * dynamically allocatable auxiliary data. - * - * ufunc: The ufunc object. - * dtypes: An array which has been populated with dtypes, - * in most cases by the type resolution funciton - * for the same ufunc. - * fixed_strides: For each input/output, either the stride that - * will be used every time the function is called - * or NPY_MAX_INTP if the stride might change or - * is not known ahead of time. The loop selection - * function may use this stride to pick inner loops - * which are optimized for contiguous or 0-stride - * cases. - * out_innerloop: Should be populated with the correct ufunc inner - * loop for the given type. - * out_innerloopdata: Should be populated with the void* data to - * be passed into the out_innerloop function. - * out_needs_api: If the inner loop needs to use the Python API, - * should set the to 1, otherwise should leave - * this untouched. - */ -typedef int (PyUFunc_LegacyInnerLoopSelectionFunc)( - struct _tagPyUFuncObject *ufunc, - PyArray_Descr **dtypes, - PyUFuncGenericFunction *out_innerloop, - void **out_innerloopdata, - int *out_needs_api); -typedef int (PyUFunc_InnerLoopSelectionFunc)( - struct _tagPyUFuncObject *ufunc, - PyArray_Descr **dtypes, - npy_intp *fixed_strides, - PyUFunc_StridedInnerLoopFunc **out_innerloop, - NpyAuxData **out_innerloopdata, - int *out_needs_api); -typedef int (PyUFunc_MaskedInnerLoopSelectionFunc)( - struct _tagPyUFuncObject *ufunc, - PyArray_Descr **dtypes, - PyArray_Descr *mask_dtype, - npy_intp *fixed_strides, - npy_intp fixed_mask_stride, - PyUFunc_MaskedStridedInnerLoopFunc **out_innerloop, - NpyAuxData **out_innerloopdata, - int *out_needs_api); - -typedef struct _tagPyUFuncObject { - PyObject_HEAD - /* - * nin: Number of inputs - * nout: Number of outputs - * nargs: Always nin + nout (Why is it stored?) 
- */ - int nin, nout, nargs; - - /* Identity for reduction, either PyUFunc_One or PyUFunc_Zero */ - int identity; - - /* Array of one-dimensional core loops */ - PyUFuncGenericFunction *functions; - /* Array of funcdata that gets passed into the functions */ - void **data; - /* The number of elements in 'functions' and 'data' */ - int ntypes; - - /* Does not appear to be used */ - int check_return; - - /* The name of the ufunc */ - char *name; - - /* Array of type numbers, of size ('nargs' * 'ntypes') */ - char *types; - - /* Documentation string */ - char *doc; - - void *ptr; - PyObject *obj; - PyObject *userloops; - - /* generalized ufunc parameters */ - - /* 0 for scalar ufunc; 1 for generalized ufunc */ - int core_enabled; - /* number of distinct dimension names in signature */ - int core_num_dim_ix; - - /* - * dimension indices of input/output argument k are stored in - * core_dim_ixs[core_offsets[k]..core_offsets[k]+core_num_dims[k]-1] - */ - - /* numbers of core dimensions of each argument */ - int *core_num_dims; - /* - * dimension indices in a flatted form; indices - * are in the range of [0,core_num_dim_ix) - */ - int *core_dim_ixs; - /* - * positions of 1st core dimensions of each - * argument in core_dim_ixs - */ - int *core_offsets; - /* signature string for printing purpose */ - char *core_signature; - - /* - * A function which resolves the types and fills an array - * with the dtypes for the inputs and outputs. - */ - PyUFunc_TypeResolutionFunc *type_resolver; - /* - * A function which returns an inner loop written for - * NumPy 1.6 and earlier ufuncs. This is for backwards - * compatibility, and may be NULL if inner_loop_selector - * is specified. - */ - PyUFunc_LegacyInnerLoopSelectionFunc *legacy_inner_loop_selector; - /* - * A function which returns an inner loop for the new mechanism - * in NumPy 1.7 and later. If provided, this is used, otherwise - * if NULL the legacy_inner_loop_selector is used instead. - */ - PyUFunc_InnerLoopSelectionFunc *inner_loop_selector; - /* - * A function which returns a masked inner loop for the ufunc. 
- */ - PyUFunc_MaskedInnerLoopSelectionFunc *masked_inner_loop_selector; -} PyUFuncObject; - -#include "arrayobject.h" - -#define UFUNC_ERR_IGNORE 0 -#define UFUNC_ERR_WARN 1 -#define UFUNC_ERR_RAISE 2 -#define UFUNC_ERR_CALL 3 -#define UFUNC_ERR_PRINT 4 -#define UFUNC_ERR_LOG 5 - - /* Python side integer mask */ - -#define UFUNC_MASK_DIVIDEBYZERO 0x07 -#define UFUNC_MASK_OVERFLOW 0x3f -#define UFUNC_MASK_UNDERFLOW 0x1ff -#define UFUNC_MASK_INVALID 0xfff - -#define UFUNC_SHIFT_DIVIDEBYZERO 0 -#define UFUNC_SHIFT_OVERFLOW 3 -#define UFUNC_SHIFT_UNDERFLOW 6 -#define UFUNC_SHIFT_INVALID 9 - - -/* platform-dependent code translates floating point - status to an integer sum of these values -*/ -#define UFUNC_FPE_DIVIDEBYZERO 1 -#define UFUNC_FPE_OVERFLOW 2 -#define UFUNC_FPE_UNDERFLOW 4 -#define UFUNC_FPE_INVALID 8 - -/* Error mode that avoids look-up (no checking) */ -#define UFUNC_ERR_DEFAULT 0 - -#define UFUNC_OBJ_ISOBJECT 1 -#define UFUNC_OBJ_NEEDS_API 2 - - /* Default user error mode */ -#define UFUNC_ERR_DEFAULT2 \ - (UFUNC_ERR_WARN << UFUNC_SHIFT_DIVIDEBYZERO) + \ - (UFUNC_ERR_WARN << UFUNC_SHIFT_OVERFLOW) + \ - (UFUNC_ERR_WARN << UFUNC_SHIFT_INVALID) - -#if NPY_ALLOW_THREADS -#define NPY_LOOP_BEGIN_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) _save = PyEval_SaveThread();} while (0); -#define NPY_LOOP_END_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) PyEval_RestoreThread(_save);} while (0); -#else -#define NPY_LOOP_BEGIN_THREADS -#define NPY_LOOP_END_THREADS -#endif - -/* - * UFunc has unit of 1, and the order of operations can be reordered - * This case allows reduction with multiple axes at once. - */ -#define PyUFunc_One 1 -/* - * UFunc has unit of 0, and the order of operations can be reordered - * This case allows reduction with multiple axes at once. - */ -#define PyUFunc_Zero 0 -/* - * UFunc has no unit, and the order of operations cannot be reordered. - * This case does not allow reduction with multiple axes at once. - */ -#define PyUFunc_None -1 -/* - * UFunc has no unit, and the order of operations can be reordered - * This case allows reduction with multiple axes at once. - */ -#define PyUFunc_ReorderableNone -2 - -#define UFUNC_REDUCE 0 -#define UFUNC_ACCUMULATE 1 -#define UFUNC_REDUCEAT 2 -#define UFUNC_OUTER 3 - - -typedef struct { - int nin; - int nout; - PyObject *callable; -} PyUFunc_PyFuncData; - -/* A linked-list of function information for - user-defined 1-d loops. - */ -typedef struct _loop1d_info { - PyUFuncGenericFunction func; - void *data; - int *arg_types; - struct _loop1d_info *next; -} PyUFunc_Loop1d; - - -#include "__ufunc_api.h" - -#define UFUNC_PYVALS_NAME "UFUNC_PYVALS" - -#define UFUNC_CHECK_ERROR(arg) \ - do {if ((((arg)->obj & UFUNC_OBJ_NEEDS_API) && PyErr_Occurred()) || \ - ((arg)->errormask && \ - PyUFunc_checkfperr((arg)->errormask, \ - (arg)->errobj, \ - &(arg)->first))) \ - goto fail;} while (0) - -/* This code checks the IEEE status flags in a platform-dependent way */ -/* Adapted from Numarray */ - -#if (defined(__unix__) || defined(unix)) && !defined(USG) -#include -#endif - -/* OSF/Alpha (Tru64) ---------------------------------------------*/ -#if defined(__osf__) && defined(__alpha) - -#include - -#define UFUNC_CHECK_STATUS(ret) { \ - unsigned long fpstatus; \ - \ - fpstatus = ieee_get_fp_control(); \ - /* clear status bits as well as disable exception mode if on */ \ - ieee_set_fp_control( 0 ); \ - ret = ((IEEE_STATUS_DZE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((IEEE_STATUS_OVF & fpstatus) ? 
UFUNC_FPE_OVERFLOW : 0) \ - | ((IEEE_STATUS_UNF & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((IEEE_STATUS_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } - -/* MS Windows -----------------------------------------------------*/ -#elif defined(_MSC_VER) - -#include - - /* Clear the floating point exception default of Borland C++ */ -#if defined(__BORLANDC__) -#define UFUNC_NOFPE _control87(MCW_EM, MCW_EM); -#endif - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus = (int) _clearfp(); \ - \ - ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } - -/* Solaris --------------------------------------------------------*/ -/* --------ignoring SunOS ieee_flags approach, someone else can -** deal with that! */ -#elif defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \ - (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \ - defined(__NetBSD__) -#include - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus; \ - \ - fpstatus = (int) fpgetsticky(); \ - ret = ((FP_X_DZ & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FP_X_OFL & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FP_X_UFL & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FP_X_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - (void) fpsetsticky(0); \ - } - -#elif defined(__GLIBC__) || defined(__APPLE__) || \ - defined(__CYGWIN__) || defined(__MINGW32__) || \ - (defined(__FreeBSD__) && (__FreeBSD_version >= 502114)) - -#if defined(__GLIBC__) || defined(__APPLE__) || \ - defined(__MINGW32__) || defined(__FreeBSD__) -#include -#endif - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus = (int) fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | \ - FE_UNDERFLOW | FE_INVALID); \ - ret = ((FE_DIVBYZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FE_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FE_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FE_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - (void) feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | \ - FE_UNDERFLOW | FE_INVALID); \ -} - -#elif defined(_AIX) - -#include -#include - -#define UFUNC_CHECK_STATUS(ret) { \ - fpflag_t fpstatus; \ - \ - fpstatus = fp_read_flag(); \ - ret = ((FP_DIV_BY_ZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FP_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FP_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FP_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - fp_swap_flag(0); \ -} - -#else - -#define NO_FLOATING_POINT_SUPPORT -#define UFUNC_CHECK_STATUS(ret) { \ - ret = 0; \ - } - -#endif - -/* - * THESE MACROS ARE DEPRECATED. - * Use npy_set_floatstatus_* in the npymath library. 
- */ -#define generate_divbyzero_error() npy_set_floatstatus_divbyzero() -#define generate_overflow_error() npy_set_floatstatus_overflow() - - /* Make sure it gets defined if it isn't already */ -#ifndef UFUNC_NOFPE -#define UFUNC_NOFPE -#endif - - -#ifdef __cplusplus -} -#endif -#endif /* !Py_UFUNCOBJECT_H */ diff --git a/include/numpy/utils.h b/include/numpy/utils.h deleted file mode 100644 index cc968a354..000000000 --- a/include/numpy/utils.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NUMPY_UTILS_HEADER__ -#define __NUMPY_UTILS_HEADER__ - -#ifndef __COMP_NPY_UNUSED - #if defined(__GNUC__) - #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) - # elif defined(__ICC) - #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) - #else - #define __COMP_NPY_UNUSED - #endif -#endif - -/* Use this to tag a variable as not used. It will remove unused variable - * warning on support platforms (see __COM_NPY_UNUSED) and mangle the variable - * to avoid accidental use */ -#define NPY_UNUSED(x) (__NPY_UNUSED_TAGGED ## x) __COMP_NPY_UNUSED - -#endif diff --git a/pyproject.toml b/pyproject.toml index 77deb44b0..d23730b00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,10 +6,9 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a29,<8.0.0a40", + "thinc>=8.0.0a30,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", - "smart_open>=2.0.0,<3.0.0", "pathy" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 5aafd83dd..9b108de8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a29,<8.0.0a40 +thinc>=8.0.0a30,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.7.1,<1.1.0 +wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 -catalogue>=0.0.7,<1.1.0 +catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 pathy # Third party dependencies @@ -16,7 +16,6 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.3.0,<2.0.0 pytokenizations -smart_open>=2.0.0,<3.0.0 # Official Python utilities setuptools packaging diff --git a/setup.cfg b/setup.cfg index 8b4819ed8..fc33abedb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,17 +34,17 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a29,<8.0.0a40 + thinc>=8.0.0a30,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a29,<8.0.0a40 + thinc>=8.0.0a30,<8.0.0a40 blis>=0.4.0,<0.5.0 - wasabi>=0.7.1,<1.1.0 + wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 - catalogue>=0.0.7,<1.1.0 + catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 pathy # Third-party dependencies @@ -53,7 +53,6 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 pytokenizations - smart_open>=2.0.0,<3.0.0 # Official Python utilities setuptools packaging diff --git a/setup.py b/setup.py index af4cd0ec6..d448a262c 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,6 @@ import sys import platform from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc -from distutils import ccompiler, msvccompiler import numpy from pathlib import Path import shutil @@ -195,13 +194,7 @@ def setup_package(): include_dirs = [ get_python_inc(plat_specific=True), numpy.get_include(), - str(ROOT / "include"), ] - if ( - ccompiler.new_compiler().compiler_type == "msvc" - and msvccompiler.get_build_version() == 9 - ): - include_dirs.append(str(ROOT / "include" / 
"msvc9")) ext_modules = [] for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" diff --git a/spacy/__init__.py b/spacy/__init__.py index d07ee5674..5c286ed80 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -20,24 +20,30 @@ from .errors import Errors from .language import Language from . import util + if sys.maxunicode == 65535: raise SystemError(Errors.E130) def load( name: Union[str, Path], - disable: Iterable[str] = tuple(), + disable: Iterable[str] = util.SimpleFrozenList(), + exclude: Iterable[str] = util.SimpleFrozenList(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), ) -> Language: """Load a spaCy model from an installed package or a local path. name (str): Package name or model path. - disable (Iterable[str]): Names of pipeline components to disable. + disable (Iterable[str]): Names of pipeline components to disable. Disabled + pipes will be loaded but they won't be run unless you explicitly + enable them by calling nlp.enable_pipe. + exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. """ - return util.load_model(name, disable=disable, config=config) + return util.load_model(name, disable=disable, exclude=exclude, config=config) def blank(name: str, **overrides) -> Language: diff --git a/spacy/about.py b/spacy/about.py index da3e32805..418e44c1d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a10" +__version__ = "3.0.0a12" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 4c3adc5d3..b47c1c16b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -11,6 +11,7 @@ from .profile import profile # noqa: F401 from .train import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 +from .debug_config import debug_config # noqa: F401 from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 @@ -23,6 +24,7 @@ from .project.run import project_run # noqa: F401 from .project.dvc import project_update_dvc # noqa: F401 from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 +from .project.document import project_document # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index b527ac2a0..16e257ce2 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,17 +1,19 @@ from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING import sys +import shutil from pathlib import Path from wasabi import msg import srsly import hashlib import typer +from click import NoSuchOption from typer.main import get_command from contextlib import contextmanager from thinc.config import Config, ConfigValidationError from configparser import InterpolationError from ..schemas import ProjectConfigSchema, validate -from ..util import import_file +from ..util import import_file, run_command, make_tempdir if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -71,9 +73,10 @@ def 
parse_config_overrides(args: List[str]) -> Dict[str, Any]: opt = args.pop(0) err = f"Invalid CLI argument '{opt}'" if opt.startswith("--"): # new argument + orig_opt = opt opt = opt.replace("--", "") if "." not in opt: - msg.fail(f"{err}: can't override top-level section", exits=1) + raise NoSuchOption(orig_opt) if "=" in opt: # we have --opt=value opt, value = opt.split("=", 1) opt = opt.replace("-", "_") @@ -194,7 +197,7 @@ def get_checksum(path: Union[Path, str]) -> str: for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): dir_checksum.update(sub_file.read_bytes()) return dir_checksum.hexdigest() - raise ValueError(f"Can't get checksum for {path}: not a file or directory") + msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) @contextmanager @@ -260,8 +263,10 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: src (Path): The source path. url (str): The destination URL to upload to. """ - dest = ensure_pathy(dest) - with dest.open(mode="wb") as output_file: + import smart_open + + dest = str(dest) + with smart_open.open(dest, mode="wb") as output_file: with src.open(mode="rb") as input_file: output_file.write(input_file.read()) @@ -274,10 +279,12 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) force (bool): Whether to force download even if file exists. If False, the download will be skipped. """ + import smart_open + if dest.exists() and not force: return None - src = ensure_pathy(src) - with src.open(mode="rb") as input_file: + src = str(src) + with smart_open.open(src, mode="rb") as input_file: with dest.open(mode="wb") as output_file: output_file.write(input_file.read()) @@ -288,3 +295,49 @@ def ensure_pathy(path): from pathy import Pathy # noqa: F811 return Pathy(path) + + +def git_sparse_checkout( + repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None +): + if dest.exists(): + msg.fail("Destination of checkout must not exist", exits=1) + if not dest.parent.exists(): + raise IOError("Parent of destination of checkout must exist") + # We're using Git, partial clone and sparse checkout to + # only clone the files we need + # This ends up being RIDICULOUS. omg. + # So, every tutorial and SO post talks about 'sparse checkout'...But they + # go and *clone* the whole repo. Worthless. And cloning part of a repo + # turns out to be completely broken. The only way to specify a "path" is.. + # a path *on the server*? The contents of which, specifies the paths. Wat. + # Obviously this is hopelessly broken and insecure, because you can query + # arbitrary paths on the server! So nobody enables this. + # What we have to do is disable *all* files. We could then just checkout + # the path, and it'd "work", but be hopelessly slow...Because it goes and + # transfers every missing object one-by-one. So the final piece is that we + # need to use some weird git internals to fetch the missings in bulk, and + # *that* we can do by path. + # We're using Git and sparse checkout to only clone the files we need + with make_tempdir() as tmp_dir: + # This is the "clone, but don't download anything" part. + cmd = ( + f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " + "--filter=blob:none" # <-- The key bit + ) + if branch is not None: + cmd = f"{cmd} -b {branch}" + run_command(cmd, capture=True) + # Now we need to find the missing filenames for the subpath we want. + # Looking for this 'rev-list' command in the git --help? Hah. 
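+    # With the blob:none partial clone above, 'rev-list --missing=print' lists every
+    # object we still lack with a leading '?'; those hashes are stripped below and
+    # fed to 'git fetch-pack' in one batch, so the missing blobs arrive in a single transfer.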
+ cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" + ret = run_command(cmd, capture=True) + missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) + # Now pass those missings into another bit of git internals + run_command( + f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings + ) + # And finally, we can checkout our subpath + run_command(f"git -C {tmp_dir} checkout {branch} {subpath}") + # We need Path(name) to make sure we also support subdirectories + shutil.move(str(tmp_dir / Path(subpath)), str(dest)) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 864051240..f73c2f2c0 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -50,6 +50,7 @@ def convert_cli( converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), + concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), # fmt: on ): """ @@ -82,6 +83,7 @@ def convert_cli( converter=converter, ner_map=ner_map, lang=lang, + concatenate=concatenate, silent=silent, msg=msg, ) @@ -100,13 +102,15 @@ def convert( converter: str = "auto", ner_map: Optional[Path] = None, lang: Optional[str] = None, + concatenate: bool = False, silent: bool = True, msg: Optional[Printer], ) -> None: if not msg: msg = Printer(no_print=silent) ner_map = srsly.read_json(ner_map) if ner_map is not None else None - for input_loc in walk_directory(Path(input_path)): + doc_files = [] + for input_loc in walk_directory(Path(input_path), converter): input_data = input_loc.open("r", encoding="utf-8").read() # Use converter function to convert data func = CONVERTERS[converter] @@ -121,6 +125,13 @@ def convert( no_print=silent, ner_map=ner_map, ) + doc_files.append((input_loc, docs)) + if concatenate: + all_docs = [] + for _, docs in doc_files: + all_docs.extend(docs) + doc_files = [(input_path, all_docs)] + for input_loc, docs in doc_files: if file_type == "json": data = [docs_to_json(docs)] else: @@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]: return None -def walk_directory(path: Path) -> List[Path]: +def walk_directory(path: Path, converter: str) -> List[Path]: if not path.is_dir(): return [path] paths = [path] @@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]: continue elif path.is_dir(): paths.extend(path.iterdir()) + elif converter == "json" and not path.parts[-1].endswith("json"): + continue + elif converter == "conll" and not path.parts[-1].endswith("conll"): + continue + elif converter == "iob" and not path.parts[-1].endswith("iob"): + continue else: locs.append(path) return locs @@ -214,11 +231,11 @@ def verify_cli_args( if ner_map is not None and not Path(ner_map).exists(): msg.fail("NER map not found", ner_map, exits=1) if input_path.is_dir(): - input_locs = walk_directory(input_path) + input_locs = walk_directory(input_path, converter) if len(input_locs) == 0: msg.fail("No input files in directory", input_path, exits=1) file_types = list(set([loc.suffix[1:] for loc in input_locs])) - if len(file_types) >= 2: + if converter == "auto" and len(file_types) >= 2: file_types = ",".join(file_types) msg.fail("All input files must be same type", file_types, exits=1) if converter != "auto" and converter not in CONVERTERS: @@ 
-227,7 +244,7 @@ def verify_cli_args( def _get_converter(msg, converter, input_path): if input_path.is_dir(): - input_path = walk_directory(input_path)[0] + input_path = walk_directory(input_path, converter)[0] if converter == "auto": converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py new file mode 100644 index 000000000..2944cd364 --- /dev/null +++ b/spacy/cli/debug_config.py @@ -0,0 +1,93 @@ +from typing import Optional, Dict, Any, Union, List +from pathlib import Path +from wasabi import msg, table +from thinc.api import Config +from thinc.config import VARIABLE_RE +import typer + +from ._util import Arg, Opt, show_validation_error, parse_config_overrides +from ._util import import_code, debug_cli +from .. import util + + +@debug_cli.command( + "config", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def debug_config_cli( + # fmt: off + ctx: typer.Context, # This is only used to read additional arguments + config_path: Path = Arg(..., help="Path to config file", exists=True), + code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), + show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") + # fmt: on +): + """Debug a config.cfg file and show validation errors. The command will + create all objects in the tree and validate them. Note that some config + validation errors are blocking and will prevent the rest of the config from + being resolved. This means that you may not see all validation errors at + once and some issues are only shown once previous errors have been fixed. + Similar as with the 'train' command, you can override settings from the config + as command line options. For instance, --training.batch_size 128 overrides + the value of "batch_size" in the block "[training]". 
+ """ + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + debug_config( + config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars + ) + + +def debug_config( + config_path: Path, + *, + overrides: Dict[str, Any] = {}, + show_funcs: bool = False, + show_vars: bool = False, +): + msg.divider("Config validation") + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides) + nlp, _ = util.load_model_from_config(config) + msg.good("Config is valid") + if show_vars: + variables = get_variables(config) + msg.divider(f"Variables ({len(variables)})") + head = ("Variable", "Value") + msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2) + if show_funcs: + funcs = get_registered_funcs(config) + msg.divider(f"Registered functions ({len(funcs)})") + for func in funcs: + func_data = { + "Registry": f"@{func['registry']}", + "Name": func["name"], + "Module": func["module"], + "File": f"{func['file']} (line {func['line_no']})", + } + msg.info(f"[{func['path']}]") + print(table(func_data).strip()) + + +def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]: + result = [] + for key, value in util.walk_dict(config): + if not key[-1].startswith("@"): + continue + # We have a reference to a registered function + reg_name = key[-1][1:] + registry = getattr(util.registry, reg_name) + path = ".".join(key[:-1]) + info = registry.find(value) + result.append({"name": value, "registry": reg_name, "path": path, **info}) + return result + + +def get_variables(config: Config) -> Dict[str, Any]: + result = {} + for variable in sorted(set(VARIABLE_RE.findall(config.to_str()))): + path = variable[2:-1].replace(":", ".") + value = util.dot_to_object(config, path) + result[variable] = repr(value) + return result diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index b23705311..2f48a29cd 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -23,34 +23,6 @@ BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 -@debug_cli.command( - "config", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def debug_config_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - # fmt: on -): - """Debug a config.cfg file and show validation errors. The command will - create all objects in the tree and validate them. Note that some config - validation errors are blocking and will prevent the rest of the config from - being resolved. This means that you may not see all validation errors at - once and some issues are only shown once previous errors have been fixed. - Similar as with the 'train' command, you can override settings from the config - as command line options. For instance, --training.batch_size 128 overrides - the value of "batch_size" in the block "[training]". 
- """ - overrides = parse_config_overrides(ctx.args) - import_code(code_path) - with show_validation_error(config_path): - config = util.load_config(config_path, overrides=overrides) - nlp, _ = util.load_model_from_config(config) - msg.good("Original config is valid") - - @debug_cli.command( "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 604a5676a..ed8d54655 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -51,7 +51,7 @@ def debug_model_cli( with show_validation_error(config_path): config = util.load_config(config_path, overrides=config_overrides) nlp, config = util.load_model_from_config(config_path) - seed = config["pretraining"]["seed"] + seed = config["training"]["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index ca2067edf..ca082b939 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,7 +1,7 @@ from typing import Optional, Dict, Any, Union import platform from pathlib import Path -from wasabi import Printer +from wasabi import Printer, MarkdownRenderer import srsly from ._util import app, Arg, Opt @@ -97,12 +97,13 @@ def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str: title (str / None): Title, will be rendered as headline 2. RETURNS (str): The Markdown string. """ - markdown = [] + md = MarkdownRenderer() + if title: + md.add(md.title(2, title)) + items = [] for key, value in data.items(): if isinstance(value, str) and Path(value).exists(): continue - markdown.append(f"* **{key}:** {value}") - result = "\n{}\n".format("\n".join(markdown)) - if title: - result = f"\n## {title}\n{result}" - return result + items.append(f"{md.bold(f'{key}:')} {value}") + md.add(md.list(items)) + return f"\n{md.text}\n" diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 94e0bd6fc..b5335df51 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -7,6 +7,7 @@ import srsly import re from .. 
import util +from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND @@ -48,6 +49,7 @@ def init_fill_config_cli( # fmt: off base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), + pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes") # fmt: on ): @@ -58,19 +60,27 @@ def init_fill_config_cli( can be used with a config generated via the training quickstart widget: https://nightly.spacy.io/usage/training#quickstart """ - fill_config(output_file, base_path, diff=diff) + fill_config(output_file, base_path, pretraining=pretraining, diff=diff) def fill_config( - output_file: Path, base_path: Path, *, diff: bool = False + output_file: Path, base_path: Path, *, pretraining: bool = False, diff: bool = False ) -> Tuple[Config, Config]: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) with show_validation_error(hint_fill=False): config = util.load_config(base_path) - nlp, _ = util.load_model_from_config(config, auto_fill=True) + nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False) + # Load a second time with validation to be extra sure that the produced + # config result is a valid config + nlp, _ = util.load_model_from_config(nlp.config) + filled = nlp.config + if pretraining: + validate_config_for_pretrain(filled, msg) + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + filled = pretrain_config.merge(filled) before = config.to_str() - after = nlp.config.to_str() + after = filled.to_str() if before == after: msg.warn("Nothing to auto-fill: base config is already complete") else: @@ -84,7 +94,7 @@ def fill_config( print(diff_strings(before, after)) msg.divider("END CONFIG DIFF") print("") - save_config(nlp.config, output_file, is_stdout=is_stdout) + save_config(filled, output_file, is_stdout=is_stdout) def init_config( @@ -132,12 +142,9 @@ def init_config( msg.info("Generated template specific for your use case") for label, value in use_case.items(): msg.text(f"- {label}: {value}") - use_transformer = bool(template_vars.use_transformer) with show_validation_error(hint_fill=False): config = util.load_config_from_str(base_template) nlp, _ = util.load_model_from_config(config, auto_fill=True) - if use_transformer: - nlp.config.pop("pretraining", {}) # TODO: solve this better msg.good("Auto-filled config with all values") save_config(nlp.config, output_file, is_stdout=is_stdout) @@ -147,6 +154,8 @@ def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> N if is_stdout: print(config.to_str()) else: + if not output_file.parent.exists(): + output_file.parent.mkdir(parents=True) config.to_disk(output_file, interpolate=False) msg.good("Saved config", output_file) msg.text("You can now add your data and train your model:") @@ -161,3 +170,15 @@ def has_spacy_transformers() -> bool: return True except ImportError: return False + + +def validate_config_for_pretrain(config: Config, msg: Printer) -> None: + if "tok2vec" not in config["nlp"]["pipeline"]: + msg.warn( + "No tok2vec component found in the pipeline. 
If your tok2vec " + "component has a different name, you may need to adjust the " + "tok2vec_model reference in the [pretraining] block. If you don't " + "have a tok2vec component, make sure to add it to your [components] " + "and the pipeline specified in the [nlp] block, so you can pretrain " + "weights for it." + ) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 523e8a99a..4e5038951 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -77,7 +77,9 @@ def package( meta = generate_meta(meta, msg) errors = validate(ModelMetaSchema, meta) if errors: - msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) + msg.fail("Invalid model meta.json") + print("\n".join(errors)) + sys.exit(1) model_name = meta["lang"] + "_" + meta["name"] model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v @@ -118,10 +120,10 @@ def get_meta( "lang": "en", "name": "model", "version": "0.0.0", - "description": None, - "author": None, - "email": None, - "url": None, + "description": "", + "author": "", + "email": "", + "url": "", "license": "MIT", } meta.update(existing_meta) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 82950f402..5f20773e1 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -90,19 +90,20 @@ def pretrain( with show_validation_error(config_path): config = util.load_config(config_path, overrides=config_overrides) nlp, config = util.load_model_from_config(config) - # TODO: validate that [pretraining] block exists + pretrain_config = config["pretraining"] + if not pretrain_config: + # TODO: What's the solution here? How do we handle optional blocks? + msg.fail("The [pretraining] block in your config is empty", exits=1) if not output_dir.exists(): output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") - seed = config["pretraining"]["seed"] + seed = pretrain_config["seed"] if seed is not None: fix_random_seed(seed) - if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]: + if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]: use_pytorch_for_gpu_memory() config.to_disk(output_dir / "config.cfg") msg.good("Saved config file in the output directory") - pretrain_config = config["pretraining"] - if texts_loc != "-": # reading from a file with msg.loading("Loading input texts..."): texts = list(srsly.read_jsonl(texts_loc)) diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 60cf95160..e33a82acc 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -7,16 +7,7 @@ import requests from ...util import ensure_path, working_dir from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum -from .._util import download_file - - -# TODO: find a solution for caches -# CACHES = [ -# Path.home() / ".torch", -# Path.home() / ".caches" / "torch", -# os.environ.get("TORCH_HOME"), -# Path.home() / ".keras", -# ] +from .._util import download_file, git_sparse_checkout @project_cli.command("assets") @@ -45,14 +36,29 @@ def project_assets(project_dir: Path) -> None: msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") for asset in assets: - dest = asset["dest"] - url = asset.get("url") + dest = Path(asset["dest"]) checksum = asset.get("checksum") - if not url: - # project.yml defines asset without URL that the user has to place - check_private_asset(dest, checksum) - continue - fetch_asset(project_path, url, dest, checksum) + if "git" in asset: + if 
dest.exists(): + # If there's already a file, check for checksum + if checksum and checksum == get_checksum(dest): + msg.good(f"Skipping download with matching checksum: {dest}") + continue + else: + shutil.rmtree(dest) + git_sparse_checkout( + asset["git"]["repo"], + asset["git"]["path"], + dest, + branch=asset["git"].get("branch"), + ) + else: + url = asset.get("url") + if not url: + # project.yml defines asset without URL that the user has to place + check_private_asset(dest, checksum) + continue + fetch_asset(project_path, url, dest, checksum) def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: @@ -84,7 +90,6 @@ def fetch_asset( RETURNS (Optional[Path]): The path to the fetched asset or None if fetching the asset failed. """ - # TODO: add support for caches dest_path = (project_path / dest).resolve() if dest_path.exists() and checksum: # If there's already a file, check for checksum @@ -119,7 +124,7 @@ def convert_asset_url(url: str) -> str: RETURNS (str): The converted URL. """ # If the asset URL is a regular GitHub URL it's likely a mistake - if re.match(r"(http(s?)):\/\/github.com", url): + if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url: converted = url.replace("github.com", "raw.githubusercontent.com") converted = re.sub(r"/(tree|blob)/", "/", converted) msg.warn( diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index bb9ba74cb..7f9a46a46 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -2,12 +2,12 @@ from typing import Optional from pathlib import Path from wasabi import msg import subprocess -import shutil import re from ... import about -from ...util import ensure_path, run_command, make_tempdir +from ...util import ensure_path from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE +from .._util import git_sparse_checkout @project_cli.command("clone") @@ -39,24 +39,11 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N check_clone(name, dest, repo) project_dir = dest.resolve() repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) - # We're using Git and sparse checkout to only clone the files we need - with make_tempdir() as tmp_dir: - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" - try: - run_command(cmd) - except subprocess.CalledProcessError: - err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." 
- msg.fail(err) - with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: - f.write(name) - try: - run_command(["git", "-C", str(tmp_dir), "fetch"]) - run_command(["git", "-C", str(tmp_dir), "checkout"]) - except subprocess.CalledProcessError: - err = f"Could not clone '{name}' from repo '{repo_name}'" - msg.fail(err) - # We need Path(name) to make sure we also support subdirectories - shutil.move(str(tmp_dir / Path(name)), str(project_dir)) + try: + git_sparse_checkout(repo, name, dest) + except subprocess.CalledProcessError: + err = f"Could not clone '{name}' from repo '{repo_name}'" + msg.fail(err) msg.good(f"Cloned '{name}' from {repo_name}", project_dir) if not (project_dir / PROJECT_FILE).exists(): msg.warn(f"No {PROJECT_FILE} found in directory") diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py new file mode 100644 index 000000000..ffa77b2d5 --- /dev/null +++ b/spacy/cli/project/document.py @@ -0,0 +1,108 @@ +from pathlib import Path +from wasabi import msg, MarkdownRenderer + +from ...util import working_dir +from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config + + +DOCS_URL = "https://nightly.spacy.io" +INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the +project, as well as the available commands and workflows. For details, see the +[spaCy projects documentation]({DOCS_URL}/usage/projects).""" +INTRO_COMMANDS = f"""The following commands are defined by the project. They +can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run). +Commands are only re-run if their inputs have changed.""" +INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They +can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run) +and will run the specified commands in order. Commands are only re-run if their +inputs have changed.""" +INTRO_ASSETS = f"""The following assets are defined by the project. They can +be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets) +in the project directory.""" +# These markers are added to the Markdown and can be used to update the file in +# place if it already exists. Only the auto-generated part will be replaced. +MARKER_START = "" +MARKER_END = "" + + +@project_cli.command("document") +def project_document_cli( + # fmt: off + project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), + output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"), + no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji") + # fmt: on +): + """ + Auto-generate a README.md for a project. If the content is saved to a file, + hidden markers are added so you can add custom content before or after the + auto-generated section and only the auto-generated docs will be replaced + when you re-run the command. 
+ """ + project_document(project_dir, output_file, no_emoji=no_emoji) + + +def project_document( + project_dir: Path, output_file: Path, *, no_emoji: bool = False +) -> None: + is_stdout = str(output_file) == "-" + config = load_project_config(project_dir) + md = MarkdownRenderer(no_emoji=no_emoji) + md.add(MARKER_START) + title = config.get("title") + description = config.get("description") + md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐")) + if description: + md.add(description) + md.add(md.title(2, PROJECT_FILE, "📋")) + md.add(INTRO_PROJECT) + # Commands + cmds = config.get("commands", []) + data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds] + if data: + md.add(md.title(3, "Commands", "⏯")) + md.add(INTRO_COMMANDS) + md.add(md.table(data, ["Command", "Description"])) + # Workflows + wfs = config.get("workflows", {}).items() + data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] + if data: + md.add(md.title(3, "Workflows", "⏭")) + md.add(INTRO_WORKFLOWS) + md.add(md.table(data, ["Workflow", "Steps"])) + # Assets + assets = config.get("assets", []) + data = [] + for a in assets: + source = "Git" if a.get("git") else "URL" if a.get("url") else "Local" + dest_path = a["dest"] + dest = md.code(dest_path) + if source == "Local": + # Only link assets if they're in the repo + with working_dir(project_dir) as p: + if (p / dest_path).exists(): + dest = md.link(dest, dest_path) + data.append((dest, source, a.get("description", ""))) + if data: + md.add(md.title(3, "Assets", "🗂")) + md.add(INTRO_ASSETS) + md.add(md.table(data, ["File", "Source", "Description"])) + md.add(MARKER_END) + # Output result + if is_stdout: + print(md.text) + else: + content = md.text + if output_file.exists(): + with output_file.open("r", encoding="utf8") as f: + existing = f.read() + if MARKER_START in existing and MARKER_END in existing: + msg.info("Found existing file: only replacing auto-generated docs") + before = existing.split(MARKER_START)[0] + after = existing.split(MARKER_END)[1] + content = f"{before}{content}{after}" + else: + msg.info("Replacing existing file") + with output_file.open("w") as f: + f.write(content) + msg.good("Saved project documentation", output_file) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py index e0f6cd430..de0480bad 100644 --- a/spacy/cli/project/dvc.py +++ b/spacy/cli/project/dvc.py @@ -1,6 +1,6 @@ """This module contains helpers and subcommands for integrating spaCy projects with Data Version Controk (DVC). https://dvc.org""" -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, Iterable import subprocess from pathlib import Path from wasabi import msg @@ -8,6 +8,7 @@ from wasabi import msg from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli from .._util import Arg, Opt, NAME, COMMAND from ...util import working_dir, split_command, join_command, run_command +from ...util import SimpleFrozenList DVC_CONFIG = "dvc.yaml" @@ -130,7 +131,7 @@ def update_dvc_config( def run_dvc_commands( - commands: List[str] = tuple(), flags: Dict[str, bool] = {}, + commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}, ) -> None: """Run a sequence of DVC commands in a subprocess, in order. 
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 73cb46bb7..6c0f32171 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -4,6 +4,7 @@ from .remote_storage import RemoteStorage from .remote_storage import get_command_hash from .._util import project_cli, Arg from .._util import load_project_config +from .run import update_lockfile @project_cli.command("pull") @@ -13,7 +14,7 @@ def project_pull_cli( project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), # fmt: on ): - """Retrieve any precomputed outputs from a remote storage that are available. + """Retrieve available precomputed outputs from a remote storage. You can alias remotes in your project.yml by mapping them to storage paths. A storage can be anything that the smart-open library can upload to, e.g. gcs, aws, ssh, local directories etc @@ -36,3 +37,6 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) yield url, output_path + + if cmd.get("outputs") and all(loc.exists() for loc in cmd["outputs"]): + update_lockfile(project_dir, cmd) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6e1deeeee..bacd7f04b 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -1,10 +1,11 @@ -from typing import Optional, List, Dict, Sequence, Any +from typing import Optional, List, Dict, Sequence, Any, Iterable from pathlib import Path from wasabi import msg import sys import srsly from ...util import working_dir, run_command, split_command, is_cwd, join_command +from ...util import SimpleFrozenList from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import get_checksum, project_cli, Arg, Opt, COMMAND @@ -101,6 +102,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: print(f"For command details, run: {help_cmd}") else: print("") + title = config.get("title") + if title: + print(f"{title}\n") if config_commands: print(f"Available commands in {PROJECT_FILE}") print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") @@ -112,7 +116,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: def run_commands( - commands: List[str] = tuple(), silent: bool = False, dry: bool = False, + commands: Iterable[str] = SimpleFrozenList(), + silent: bool = False, + dry: bool = False, ) -> None: """Run a sequence of commands in a subprocess, in order. diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 15188cd4e..d9ab8eca5 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -18,9 +18,6 @@ from .. import util from ..gold.example import Example from ..errors import Errors -# Don't remove - required to load the built-in architectures -from ..ml import models # noqa: F401 - @app.command( "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} ) @@ -96,6 +93,7 @@ def train( train_corpus = T_cfg["train_corpus"] dev_corpus = T_cfg["dev_corpus"] batcher = T_cfg["batcher"] + train_logger = T_cfg["logger"] # Components that shouldn't be updated during training frozen_components = T_cfg["frozen_components"] # Sourced components that require resume_training @@ -117,7 +115,7 @@ def train( # Load a pretrained tok2vec model - cf.
CLI command 'pretrain' if weights_data is not None: - tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None) + tok2vec_path = config["pretraining"].get("tok2vec_model", None) if tok2vec_path is None: msg.fail( f"To use a pretrained tok2vec model, the config needs to specify which " @@ -149,10 +147,11 @@ def train( exclude=frozen_components, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") - print_row = setup_printer(T_cfg, nlp) + print_row, finalize_logger = train_logger(nlp) try: progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) + progress.set_description(f"Epoch 1") for batch, info, is_best_checkpoint in training_step_iterator: progress.update(1) if is_best_checkpoint is not None: @@ -162,7 +161,9 @@ def train( update_meta(T_cfg, nlp, info) nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) + progress.set_description(f"Epoch {info['epoch']}") except Exception as e: + finalize_logger() if output_path is not None: # We don't want to swallow the traceback if we don't have a # specific error. @@ -173,6 +174,7 @@ def train( nlp.to_disk(output_path / "model-final") raise e finally: + finalize_logger() if output_path is not None: final_model_path = output_path / "model-final" if optimizer.averages: @@ -203,7 +205,7 @@ def create_train_batches(iterator, batcher, max_epochs: int): def create_evaluation_callback( - nlp: Language, dev_corpus: Callable, weights: Dict[str, float], + nlp: Language, dev_corpus: Callable, weights: Dict[str, float] ) -> Callable[[], Tuple[float, Dict[str, float]]]: def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) @@ -353,57 +355,6 @@ def subdivide_batch(batch, accumulate_gradient): yield subbatch -def setup_printer( - training: Union[Dict[str, Any], Config], nlp: Language -) -> Callable[[Dict[str, Any]], None]: - score_cols = list(training["score_weights"]) - score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] - loss_widths = [max(len(col), 8) for col in loss_cols] - table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] - table_header = [col.upper() for col in table_header] - table_widths = [3, 6] + loss_widths + score_widths + [6] - table_aligns = ["r" for _ in table_widths] - msg.row(table_header, widths=table_widths) - msg.row(["-" * width for width in table_widths]) - - def print_row(info: Dict[str, Any]) -> None: - try: - losses = [ - "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in nlp.pipe_names - ] - except KeyError as e: - raise KeyError( - Errors.E983.format( - dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()) - ) - ) from None - - try: - scores = [ - "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) - for col in score_cols - ] - except KeyError as e: - raise KeyError( - Errors.E983.format( - dict="scores (other)", - key=str(e), - keys=list(info["other_scores"].keys()), - ) - ) from None - data = ( - [info["epoch"], info["step"]] - + losses - + scores - + ["{0:.2f}".format(float(info["score"]))] - ) - msg.row(data, widths=table_widths, aligns=table_aligns) - - return print_row - - def update_meta( training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] ) -> None: @@ -435,7 +386,7 @@ def load_from_paths( return raw_text, tag_map, morph_rules, weights_data -def verify_cli_args(config_path: Path, output_path: Optional[Path] = None,) -> None: +def verify_cli_args(config_path: Path, 
output_path: Optional[Path] = None) -> None: # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) @@ -443,14 +394,6 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None,) -> N if not output_path.exists(): output_path.mkdir() msg.good(f"Created output directory: {output_path}") - elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty.", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) def verify_config(nlp: Language) -> None: diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 3eab21888..d76ef630d 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -11,6 +11,7 @@ use_pytorch_for_gpu_memory = false [nlp] lang = null pipeline = [] +disabled = [] load_vocab_data = true before_creation = null after_creation = null @@ -40,6 +41,9 @@ score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" + [training.train_corpus] @readers = "spacy.Corpus.v1" path = ${paths.train} @@ -90,29 +94,3 @@ eps = 1e-8 warmup_steps = 250 total_steps = 20000 initial_rate = 0.001 - -[pretraining] -max_epochs = 1000 -min_length = 5 -max_length = 500 -dropout = 0.2 -n_save_every = null -batch_size = 3000 -seed = ${system.seed} -use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory} -tok2vec_model = "components.tok2vec.model" - -[pretraining.objective] -type = "characters" -n_characters = 4 - -[pretraining.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 1e-8 -learn_rate = 0.001 diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg new file mode 100644 index 000000000..7032eac03 --- /dev/null +++ b/spacy/default_config_pretraining.cfg @@ -0,0 +1,25 @@ +[pretraining] +max_epochs = 1000 +min_length = 5 +max_length = 500 +dropout = 0.2 +n_save_every = null +batch_size = 3000 +seed = ${system.seed} +use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory} +tok2vec_model = "components.tok2vec.model" + +[pretraining.objective] +type = "characters" +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 diff --git a/spacy/errors.py b/spacy/errors.py index d1e9489d1..38c89c479 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -128,7 +128,8 @@ class Errors: "got {component} (name: '{name}'). If you're using a custom " "component factory, double-check that it correctly returns your " "initialized component.") - E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.") + E004 = ("Can't set up pipeline component: a factory for '{name}' already " + "exists. Existing factory: {func}. New factory: {new_func}") E005 = ("Pipeline component '{name}' returned None. If you're using a " "custom component, maybe you forgot to return the processed Doc?") E006 = ("Invalid constraints for adding pipeline component. 
You can only " @@ -136,11 +137,10 @@ class Errors: "after (component name or index), first (True) or last (True). " "Invalid configuration: {args}. Existing components: {opts}") E007 = ("'{name}' already exists in pipeline. Existing names: {opts}") - E008 = ("Some current components would be lost when restoring previous " - "pipeline state. If you added components after calling " - "`nlp.select_pipes()`, you should remove them explicitly with " - "`nlp.remove_pipe()` before the pipeline is restored. Names of " - "the new components: {names}") + E008 = ("Can't restore disabled pipeline component '{name}' because it " + "doesn't exist in the pipeline anymore. If you want to remove " + "components from the pipeline, you should do it before calling " + "`nlp.select_pipes()` or after restoring the disabled components.") E010 = ("Word vectors set to length 0. This may be because you don't have " "a model installed or loaded, or because your model doesn't " "include word vectors. For more info, see the docs:\n" @@ -273,10 +273,6 @@ class Errors: "existing extension, set `force=True` on `{obj}.set_extension`.") E091 = ("Invalid extension attribute {name}: expected callable or None, " "but got: {value}") - E092 = ("Could not find or assign name for word vectors. Ususally, the " - "name is read from the model's meta.json in vector.name. " - "Alternatively, it is built from the 'lang' and 'name' keys in " - "the meta.json. Vector names are required to avoid issue #1660.") E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}") E094 = ("Error reading line {line_num} in vectors file {loc}.") E095 = ("Can't write to frozen dictionary. This is likely an internal " @@ -477,6 +473,13 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E926 = ("It looks like you're trying to modify nlp.{attr} directly. This " + "doesn't work because it's an immutable computed property. If you " + "need to modify the pipeline, use the built-in methods like " + "nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe " + "instead.") + E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed " + "property or default function argument?") E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the " "provided argument {loc} is an existing directory.") E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index 4d71eae09..35e67f696 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -6,3 +6,4 @@ from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 +from .loggers import console_logger, wandb_logger # noqa: F401 diff --git a/spacy/gold/loggers.py b/spacy/gold/loggers.py new file mode 100644 index 000000000..e071e5827 --- /dev/null +++ b/spacy/gold/loggers.py @@ -0,0 +1,103 @@ +from typing import Dict, Any, Tuple, Callable, List + +from ..util import registry +from ..
import util +from ..errors import Errors +from wasabi import msg + + +@registry.loggers("spacy.ConsoleLogger.v1") +def console_logger(): + def setup_printer( + nlp: "Language", + ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + score_cols = list(nlp.config["training"]["score_weights"]) + score_widths = [max(len(col), 6) for col in score_cols] + loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] + loss_widths = [max(len(col), 8) for col in loss_cols] + table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] + table_header = [col.upper() for col in table_header] + table_widths = [3, 6] + loss_widths + score_widths + [6] + table_aligns = ["r" for _ in table_widths] + msg.row(table_header, widths=table_widths) + msg.row(["-" * width for width in table_widths]) + + def log_step(info: Dict[str, Any]): + try: + losses = [ + "{0:.2f}".format(float(info["losses"][pipe_name])) + for pipe_name in nlp.pipe_names + ] + except KeyError as e: + raise KeyError( + Errors.E983.format( + dict="scores (losses)", + key=str(e), + keys=list(info["losses"].keys()), + ) + ) from None + + try: + scores = [ + "{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100) + for col in score_cols + ] + except KeyError as e: + raise KeyError( + Errors.E983.format( + dict="scores (other)", + key=str(e), + keys=list(info["other_scores"].keys()), + ) + ) from None + data = ( + [info["epoch"], info["step"]] + + losses + + scores + + ["{0:.2f}".format(float(info["score"]))] + ) + msg.row(data, widths=table_widths, aligns=table_aligns) + + def finalize(): + pass + + return log_step, finalize + + return setup_printer + + +@registry.loggers("spacy.WandbLogger.v1") +def wandb_logger(project_name: str, remove_config_values: List[str] = []): + import wandb + + console = console_logger() + + def setup_logger( + nlp: "Language", + ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + config = nlp.config.interpolate() + config_dot = util.dict_to_dot(config) + for field in remove_config_values: + del config_dot[field] + config = util.dot_to_dict(config_dot) + wandb.init(project=project_name, config=config) + console_log_step, console_finalize = console(nlp) + + def log_step(info: Dict[str, Any]): + console_log_step(info) + score = info["score"] + other_scores = info["other_scores"] + losses = info["losses"] + wandb.log({"score": score}) + if losses: + wandb.log({f"loss_{k}": v for k, v in losses.items()}) + if isinstance(other_scores, dict): + wandb.log(other_scores) + + def finalize(): + console_finalize() + pass + + return log_step, finalize + + return setup_logger diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index a180fa6e9..7ddad9893 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -34,7 +34,7 @@ class Polish(Language): @Polish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "lookup", "lookups": None}, + default_config={"model": None, "mode": "pos_lookup", "lookups": None}, scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index c4c6db06a..406ef9e4a 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -12,7 +12,7 @@ class PolishLemmatizer(Lemmatizer): @classmethod def get_lookups_config(cls, mode: str) -> Dict: - if mode == "lookup": + if mode == "pos_lookup": return { "required_tables": [ "lemma_lookup_adj", @@ -29,7 +29,7 @@ class PolishLemmatizer(Lemmatizer): else: return 
super().get_lookups_config(mode) - def lookup_lemmatize(self, token: Token) -> List[str]: + def pos_lookup_lemmatize(self, token: Token) -> List[str]: string = token.text univ_pos = token.pos_ morphology = token.morph.to_dict() diff --git a/spacy/language.py b/spacy/language.py index 57abcca0e..e20bbdd80 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -6,7 +6,7 @@ import itertools import weakref import functools from contextlib import contextmanager -from copy import copy, deepcopy +from copy import deepcopy from pathlib import Path import warnings from thinc.api import get_current_ops, Config, require_gpu, Optimizer @@ -20,7 +20,7 @@ from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .gold import Example, validate_examples from .scorer import Scorer -from .util import create_default_optimizer, registry +from .util import create_default_optimizer, registry, SimpleFrozenList from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES @@ -37,6 +37,9 @@ from . import about # This is the base config will all settings (training etc.) DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH) +# This is the base config for the [pretraining] block and currently not included +# in the main config and only added via the 'init fill-config' command +DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" class BaseDefaults: @@ -156,7 +159,8 @@ class Language: self.vocab: Vocab = vocab if self.lang is None: self.lang = self.vocab.lang - self.pipeline = [] + self._components = [] + self._disabled = set() self.max_length = max_length self.resolved = {} # Create the default tokenizer from the default config @@ -203,10 +207,11 @@ class Language: "keys": self.vocab.vectors.n_keys, "name": self.vocab.vectors.name, } - self._meta["labels"] = self.pipe_labels + self._meta["labels"] = dict(self.pipe_labels) # TODO: Adding this back to prevent breaking people's code etc., but # we should consider removing it - self._meta["pipeline"] = self.pipe_names + self._meta["pipeline"] = list(self.pipe_names) + self._meta["disabled"] = list(self.disabled) return self._meta @meta.setter @@ -229,13 +234,14 @@ class Language: # we can populate the config again later pipeline = {} score_weights = [] - for pipe_name in self.pipe_names: + for pipe_name in self.component_names: pipe_meta = self.get_pipe_meta(pipe_name) pipe_config = self.get_pipe_config(pipe_name) pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config} if pipe_meta.default_score_weights: score_weights.append(pipe_meta.default_score_weights) - self._config["nlp"]["pipeline"] = self.pipe_names + self._config["nlp"]["pipeline"] = list(self.component_names) + self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline self._config["training"]["score_weights"] = combine_score_weights(score_weights) if not srsly.is_json_serializable(self._config): @@ -246,21 +252,64 @@ class Language: def config(self, value: Config) -> None: self._config = value + @property + def disabled(self) -> List[str]: + """Get the names of all disabled components. + + RETURNS (List[str]): The disabled components. 
+ """ + # Make sure the disabled components are returned in the order they + # appear in the pipeline (which isn't guaranteed by the set) + names = [name for name, _ in self._components if name in self._disabled] + return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled")) + @property def factory_names(self) -> List[str]: """Get names of all available factories. RETURNS (List[str]): The factory names. """ - return list(self.factories.keys()) + names = list(self.factories.keys()) + return SimpleFrozenList(names) @property - def pipe_names(self) -> List[str]: - """Get names of available pipeline components. + def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]: + """Get all (name, component) tuples in the pipeline, including the + currently disabled components. + """ + return SimpleFrozenList( + self._components, error=Errors.E926.format(attr="components") + ) + + @property + def component_names(self) -> List[str]: + """Get the names of the available pipeline components. Includes all + active and inactive pipeline components. RETURNS (List[str]): List of component name strings, in order. """ - return [pipe_name for pipe_name, _ in self.pipeline] + names = [pipe_name for pipe_name, _ in self._components] + return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names")) + + @property + def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]: + """The processing pipeline consisting of (name, component) tuples. The + components are called on the Doc in order as it passes through the + pipeline. + + RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline. + """ + pipes = [(n, p) for n, p in self._components if n not in self._disabled] + return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline")) + + @property + def pipe_names(self) -> List[str]: + """Get names of available active pipeline components. + + RETURNS (List[str]): List of component name strings, in order. + """ + names = [pipe_name for pipe_name, _ in self.pipeline] + return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names")) @property def pipe_factories(self) -> Dict[str, str]: @@ -269,9 +318,9 @@ class Language: RETURNS (Dict[str, str]): Factory names, keyed by component names. """ factories = {} - for pipe_name, pipe in self.pipeline: + for pipe_name, pipe in self._components: factories[pipe_name] = self.get_pipe_meta(pipe_name).factory - return factories + return SimpleFrozenDict(factories) @property def pipe_labels(self) -> Dict[str, List[str]]: @@ -281,10 +330,10 @@ class Language: RETURNS (Dict[str, List[str]]): Labels keyed by component name. 
""" labels = {} - for name, pipe in self.pipeline: + for name, pipe in self._components: if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) - return labels + return SimpleFrozenDict(labels) @classmethod def has_factory(cls, name: str) -> bool: @@ -355,10 +404,10 @@ class Language: name: str, *, default_config: Dict[str, Any] = SimpleFrozenDict(), - assigns: Iterable[str] = tuple(), - requires: Iterable[str] = tuple(), + assigns: Iterable[str] = SimpleFrozenList(), + requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, - scores: Iterable[str] = tuple(), + scores: Iterable[str] = SimpleFrozenList(), default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: @@ -393,13 +442,21 @@ class Language: style="default config", name=name, cfg_type=type(default_config) ) raise ValueError(err) - internal_name = cls.get_factory_name(name) - if internal_name in registry.factories: - # We only check for the internal name here – it's okay if it's a - # subclass and the base class has a factory of the same name - raise ValueError(Errors.E004.format(name=name)) def add_factory(factory_func: Callable) -> Callable: + internal_name = cls.get_factory_name(name) + if internal_name in registry.factories: + # We only check for the internal name here – it's okay if it's a + # subclass and the base class has a factory of the same name. We + # also only raise if the function is different to prevent raising + # if module is reloaded. + existing_func = registry.factories.get(internal_name) + if not util.is_same_func(factory_func, existing_func): + err = Errors.E004.format( + name=name, func=existing_func, new_func=factory_func + ) + raise ValueError(err) + arg_names = util.get_arg_names(factory_func) if "nlp" not in arg_names or "name" not in arg_names: raise ValueError(Errors.E964.format(name=name)) @@ -436,8 +493,8 @@ class Language: cls, name: Optional[str] = None, *, - assigns: Iterable[str] = tuple(), - requires: Iterable[str] = tuple(), + assigns: Iterable[str] = SimpleFrozenList(), + requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, func: Optional[Callable[[Doc], Doc]] = None, ) -> Callable: @@ -469,6 +526,21 @@ class Language: def factory_func(nlp: cls, name: str) -> Callable[[Doc], Doc]: return component_func + internal_name = cls.get_factory_name(name) + if internal_name in registry.factories: + # We only check for the internal name here – it's okay if it's a + # subclass and the base class has a factory of the same name. We + # also only raise if the function is different to prevent raising + # if module is reloaded. 
It's hacky, but we need to check the + # existing function for a closure and whether that's identical + # to the component function (because factory_func created above + # will always be different, even for the same function) + existing_func = registry.factories.get(internal_name) + closure = existing_func.__closure__ + wrapped = [c.cell_contents for c in closure][0] if closure else None + if util.is_same_func(wrapped, component_func): + factory_func = existing_func # noqa: F811 + cls.factory( component_name, assigns=assigns, @@ -509,10 +581,10 @@ class Language: DOCS: https://spacy.io/api/language#get_pipe """ - for pipe_name, component in self.pipeline: + for pipe_name, component in self._components: if pipe_name == name: return component - raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names)) + raise KeyError(Errors.E001.format(name=name, opts=self.component_names)) def create_pipe( self, @@ -657,8 +729,8 @@ class Language: err = Errors.E966.format(component=bad_val, name=name) raise ValueError(err) name = name if name is not None else factory_name - if name in self.pipe_names: - raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) + if name in self.component_names: + raise ValueError(Errors.E007.format(name=name, opts=self.component_names)) if source is not None: # We're loading the component from a model. After loading the # component, we know its real factory name @@ -683,7 +755,7 @@ class Language: ) pipe_index = self._get_pipe_index(before, after, first, last) self._pipe_meta[name] = self.get_factory_meta(factory_name) - self.pipeline.insert(pipe_index, (name, pipe_component)) + self._components.insert(pipe_index, (name, pipe_component)) return pipe_component def _get_pipe_index( @@ -704,32 +776,42 @@ """ all_args = {"before": before, "after": after, "first": first, "last": last} if sum(arg is not None for arg in [before, after, first, last]) >= 2: - raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names)) + raise ValueError( + Errors.E006.format(args=all_args, opts=self.component_names) + ) if last or not any(value is not None for value in [first, before, after]): - return len(self.pipeline) + return len(self._components) elif first: return 0 elif isinstance(before, str): - if before not in self.pipe_names: - raise ValueError(Errors.E001.format(name=before, opts=self.pipe_names)) - return self.pipe_names.index(before) + if before not in self.component_names: + raise ValueError( + Errors.E001.format(name=before, opts=self.component_names) + ) + return self.component_names.index(before) elif isinstance(after, str): - if after not in self.pipe_names: - raise ValueError(Errors.E001.format(name=after, opts=self.pipe_names)) - return self.pipe_names.index(after) + 1 + if after not in self.component_names: + raise ValueError( + Errors.E001.format(name=after, opts=self.component_names) + ) + return self.component_names.index(after) + 1 # We're only accepting indices referring to components that exist # (can't just do isinstance here because bools are instance of int, too) elif type(before) == int: - if before >= len(self.pipeline) or before < 0: - err = Errors.E959.format(dir="before", idx=before, opts=self.pipe_names) + if before >= len(self._components) or before < 0: + err = Errors.E959.format( + dir="before", idx=before, opts=self.component_names + ) raise ValueError(err) return before elif type(after) == int: - if after >= len(self.pipeline) or after < 0: - err = Errors.E959.format(dir="after", idx=after,
opts=self.pipe_names) + if after >= len(self._components) or after < 0: + err = Errors.E959.format( + dir="after", idx=after, opts=self.component_names + ) raise ValueError(err) return after + 1 - raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names)) + raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names)) def has_pipe(self, name: str) -> bool: """Check if a component name is present in the pipeline. Equivalent to @@ -770,7 +852,7 @@ class Language: # to Language.pipeline to make sure the configs are handled correctly pipe_index = self.pipe_names.index(name) self.remove_pipe(name) - if not len(self.pipeline) or pipe_index == len(self.pipeline): + if not len(self._components) or pipe_index == len(self._components): # we have no components to insert before/after, or we're replacing the last component self.add_pipe(factory_name, name=name, config=config, validate=validate) else: @@ -790,12 +872,16 @@ class Language: DOCS: https://spacy.io/api/language#rename_pipe """ - if old_name not in self.pipe_names: - raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names)) - if new_name in self.pipe_names: - raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names)) - i = self.pipe_names.index(old_name) - self.pipeline[i] = (new_name, self.pipeline[i][1]) + if old_name not in self.component_names: + raise ValueError( + Errors.E001.format(name=old_name, opts=self.component_names) + ) + if new_name in self.component_names: + raise ValueError( + Errors.E007.format(name=new_name, opts=self.component_names) + ) + i = self.component_names.index(old_name) + self._components[i] = (new_name, self._components[i][1]) self._pipe_meta[new_name] = self._pipe_meta.pop(old_name) self._pipe_configs[new_name] = self._pipe_configs.pop(old_name) @@ -807,20 +893,45 @@ class Language: DOCS: https://spacy.io/api/language#remove_pipe """ - if name not in self.pipe_names: - raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) - removed = self.pipeline.pop(self.pipe_names.index(name)) + if name not in self.component_names: + raise ValueError(Errors.E001.format(name=name, opts=self.component_names)) + removed = self._components.pop(self.component_names.index(name)) # We're only removing the component itself from the metas/configs here # because factory may be used for something else self._pipe_meta.pop(name) self._pipe_configs.pop(name) + # Make sure the name is also removed from the set of disabled components + if name in self.disabled: + self._disabled.remove(name) return removed + def disable_pipe(self, name: str) -> None: + """Disable a pipeline component. The component will still exist on + the nlp object, but it won't be run as part of the pipeline. Does + nothing if the component is already disabled. + + name (str): The name of the component to disable. + """ + if name not in self.component_names: + raise ValueError(Errors.E001.format(name=name, opts=self.component_names)) + self._disabled.add(name) + + def enable_pipe(self, name: str) -> None: + """Enable a previously disabled pipeline component so it's run as part + of the pipeline. Does nothing if the component is already enabled. + + name (str): The name of the component to enable. 
+ """ + if name not in self.component_names: + raise ValueError(Errors.E001.format(name=name, opts=self.component_names)) + if name in self.disabled: + self._disabled.remove(name) + def __call__( self, text: str, *, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, ) -> Doc: """Apply the pipeline to some text. The text can span multiple sentences, @@ -866,7 +977,7 @@ class Language: warnings.warn(Warnings.W096, DeprecationWarning) if len(names) == 1 and isinstance(names[0], (list, tuple)): names = names[0] # support list of names instead of spread - return DisabledPipes(self, names) + return self.select_pipes(disable=names) def select_pipes( self, @@ -919,7 +1030,7 @@ class Language: sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, - exclude: Iterable[str] = tuple(), + exclude: Iterable[str] = SimpleFrozenList(), ): """Update the models in the pipeline. @@ -973,7 +1084,7 @@ class Language: sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, - exclude: Iterable[str] = tuple(), + exclude: Iterable[str] = SimpleFrozenList(), ) -> Dict[str, float]: """Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some @@ -1202,7 +1313,7 @@ class Language: *, as_tuples: bool = False, batch_size: int = 1000, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), cleanup: bool = False, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, n_process: int = 1, @@ -1362,7 +1473,8 @@ class Language: config: Union[Dict[str, Any], Config] = {}, *, vocab: Union[Vocab, bool] = True, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), + exclude: Iterable[str] = SimpleFrozenList(), auto_fill: bool = True, validate: bool = True, ) -> "Language": @@ -1372,7 +1484,11 @@ class Language: config (Dict[str, Any] / Config): The loaded config. vocab (Vocab): A Vocab object. If True, a vocab is created. - disable (Iterable[str]): List of pipeline component names to disable. + disable (Iterable[str]): Names of pipeline components to disable. + Disabled pipes will be loaded but they won't be run unless you + explicitly enable them by calling nlp.enable_pipe. + exclude (Iterable[str]): Names of pipeline components to exclude. + Excluded components won't be loaded. auto_fill (bool): Automatically fill in missing values in config based on defaults and function argument annotations. 
validate (bool): Validate the component config and arguments against @@ -1445,7 +1561,7 @@ class Language: raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) pipe_cfg = util.copy_config(pipeline[pipe_name]) raw_config = Config(filled["components"][pipe_name]) - if pipe_name not in disable: + if pipe_name not in exclude: if "factory" not in pipe_cfg and "source" not in pipe_cfg: err = Errors.E984.format(name=pipe_name, config=pipe_cfg) raise ValueError(err) @@ -1470,6 +1586,8 @@ class Language: ) source_name = pipe_cfg.get("component", pipe_name) nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) + disabled_pipes = [*config["nlp"]["disabled"], *disable] + nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp.config = filled if auto_fill else config nlp.resolved = resolved if after_pipeline_creation is not None: @@ -1481,7 +1599,7 @@ class Language: return nlp def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: """Save the current state to a directory. If a model is loaded, this will include the model. @@ -1499,9 +1617,7 @@ class Language: ) serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta) serializers["config.cfg"] = lambda p: self.config.to_disk(p) - for name, proc in self.pipeline: - if not hasattr(proc, "name"): - continue + for name, proc in self._components: if name in exclude: continue if not hasattr(proc, "to_disk"): @@ -1511,7 +1627,7 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the @@ -1535,18 +1651,19 @@ class Language: def deserialize_vocab(path: Path) -> None: if path.exists(): self.vocab.from_disk(path) - _fix_pretrained_vectors_name(self) path = util.ensure_path(path) deserializers = {} if Path(path / "config.cfg").exists(): - deserializers["config.cfg"] = lambda p: self.config.from_disk(p) + deserializers["config.cfg"] = lambda p: self.config.from_disk( + p, interpolate=False + ) deserializers["meta.json"] = deserialize_meta deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( p, exclude=["vocab"] ) - for name, proc in self.pipeline: + for name, proc in self._components: if name in exclude: continue if not hasattr(proc, "from_disk"): @@ -1562,7 +1679,7 @@ class Language: self._link_components() return self - def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the current state to a binary string. exclude (list): Names of components or serialization fields to exclude. 
@@ -1575,7 +1692,7 @@ class Language: serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) serializers["config.cfg"] = lambda: self.config.to_bytes() - for name, proc in self.pipeline: + for name, proc in self._components: if name in exclude: continue if not hasattr(proc, "to_bytes"): @@ -1584,7 +1701,7 @@ class Language: return util.to_bytes(serializers, exclude) def from_bytes( - self, bytes_data: bytes, *, exclude: Iterable[str] = tuple() + self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "Language": """Load state from a binary string. @@ -1602,18 +1719,16 @@ class Language: # from self.vocab.vectors, so set the name directly self.vocab.vectors.name = data.get("vectors", {}).get("name") - def deserialize_vocab(b): - self.vocab.from_bytes(b) - _fix_pretrained_vectors_name(self) - deserializers = {} - deserializers["config.cfg"] = lambda b: self.config.from_bytes(b) + deserializers["config.cfg"] = lambda b: self.config.from_bytes( + b, interpolate=False + ) deserializers["meta.json"] = deserialize_meta - deserializers["vocab"] = deserialize_vocab + deserializers["vocab"] = self.vocab.from_bytes deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( b, exclude=["vocab"] ) - for name, proc in self.pipeline: + for name, proc in self._components: if name in exclude: continue if not hasattr(proc, "from_bytes"): @@ -1643,39 +1758,16 @@ class FactoryMeta: default_score_weights: Optional[Dict[str, float]] = None # noqa: E704 -def _fix_pretrained_vectors_name(nlp: Language) -> None: - # TODO: Replace this once we handle vectors consistently as static - # data - if "vectors" in nlp.meta and "name" in nlp.meta["vectors"]: - nlp.vocab.vectors.name = nlp.meta["vectors"]["name"] - elif not nlp.vocab.vectors.size: - nlp.vocab.vectors.name = None - elif "name" in nlp.meta and "lang" in nlp.meta: - vectors_name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - nlp.vocab.vectors.name = vectors_name - else: - raise ValueError(Errors.E092) - for name, proc in nlp.pipeline: - if not hasattr(proc, "cfg") or not isinstance(proc.cfg, dict): - continue - proc.cfg.setdefault("deprecation_fixes", {}) - proc.cfg["deprecation_fixes"]["vectors_name"] = nlp.vocab.vectors.name - - class DisabledPipes(list): """Manager for temporary pipeline disabling.""" def __init__(self, nlp: Language, names: List[str]) -> None: self.nlp = nlp self.names = names - # Important! Not deep copy -- we just want the container (but we also - # want to support people providing arbitrarily typed nlp.pipeline - # objects.) - self.original_pipeline = copy(nlp.pipeline) - self.metas = {name: nlp.get_pipe_meta(name) for name in names} - self.configs = {name: nlp.get_pipe_config(name) for name in names} + for name in self.names: + self.nlp.disable_pipe(name) list.__init__(self) - self.extend(nlp.remove_pipe(name) for name in names) + self.extend(self.names) def __enter__(self): return self @@ -1685,14 +1777,10 @@ class DisabledPipes(list): def restore(self) -> None: """Restore the pipeline to its state when DisabledPipes was created.""" - current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline - unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)] - if unexpected: - # Don't change the pipeline if we're raising an error. 
- self.nlp.pipeline = current - raise ValueError(Errors.E008.format(names=unexpected)) - self.nlp._pipe_meta.update(self.metas) - self.nlp._pipe_configs.update(self.configs) + for name in self.names: + if name not in self.nlp.component_names: + raise ValueError(Errors.E008.format(name=name)) + self.nlp.enable_pipe(name) self[:] = [] diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 44f125a04..8b542b7b9 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -47,7 +47,6 @@ def init(model, X=None, Y=None): def resize_output(model, new_nO): - tok2vec = model.get_ref("tok2vec") lower = model.get_ref("lower") upper = model.get_ref("upper") if not model.attrs["has_upper"]: diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index aba76664c..85a425e29 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -4,12 +4,15 @@ from pathlib import Path from .pipe import Pipe from ..errors import Errors +from ..gold import validate_examples from ..language import Language from ..matcher import Matcher -from ..symbols import IDS +from ..scorer import Scorer +from ..symbols import IDS, TAG, POS, MORPH, LEMMA from ..tokens import Doc, Span from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..vocab import Vocab +from ..util import SimpleFrozenList from .. import util @@ -76,7 +79,7 @@ class AttributeRuler(Pipe): DOCS: https://spacy.io/api/attributeruler#call """ - matches = self.matcher(doc) + matches = sorted(self.matcher(doc)) for match_id, start, end in matches: span = Span(doc, start, end, label=match_id) @@ -192,7 +195,33 @@ class AttributeRuler(Pipe): all_patterns.append(p) return all_patterns - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def score(self, examples, **kwargs): + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by + Scorer.score_token_attr for the attributes "tag", "pos", "morph" + and "lemma" for the target token attributes. + + DOCS: https://spacy.io/api/tagger#score + """ + validate_examples(examples, "AttributeRuler.score") + results = {} + attrs = set() + for token_attrs in self.attrs: + attrs.update(token_attrs) + for attr in attrs: + if attr == TAG: + results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) + elif attr == POS: + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + elif attr == MORPH: + results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + elif attr == LEMMA: + results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) + return results + + def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the AttributeRuler to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. 
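A sketch of the new `AttributeRuler.score` hook above in context (the pattern, text and gold lemmas are made up): because this ruler only sets `LEMMA`, `nlp.evaluate` reports the `Scorer.score_token_attr` results for `"lemma"`, and likewise for `"tag"`, `"pos"` or `"morph"` when those attributes are used.

```python
from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("attribute_ruler")
ruler.add_patterns([{"patterns": [[{"ORTH": "cats"}]], "attrs": {"LEMMA": "cat"}}])
example = Example.from_dict(nlp.make_doc("two cats"), {"lemmas": ["two", "cat"]})
scores = nlp.evaluate([example])
print(scores["lemma_acc"])
```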
@@ -202,13 +231,12 @@ class AttributeRuler(Pipe): """ serialize = {} serialize["vocab"] = self.vocab.to_bytes - patterns = {k: self.matcher.get(k)[1] for k in range(len(self.attrs))} - serialize["patterns"] = lambda: srsly.msgpack_dumps(patterns) - serialize["attrs"] = lambda: srsly.msgpack_dumps(self.attrs) - serialize["indices"] = lambda: srsly.msgpack_dumps(self.indices) + serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): + def from_bytes( + self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList() + ): """Load the AttributeRuler from a bytestring. bytes_data (bytes): The data to load. @@ -217,51 +245,35 @@ class AttributeRuler(Pipe): DOCS: https://spacy.io/api/attributeruler#from_bytes """ - data = {"patterns": b""} def load_patterns(b): - data["patterns"] = srsly.msgpack_loads(b) - - def load_attrs(b): - self.attrs = srsly.msgpack_loads(b) - - def load_indices(b): - self.indices = srsly.msgpack_loads(b) + self.add_patterns(srsly.msgpack_loads(b)) deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), "patterns": load_patterns, - "attrs": load_attrs, - "indices": load_indices, } util.from_bytes(bytes_data, deserialize, exclude) - if data["patterns"]: - for key, pattern in data["patterns"].items(): - self.matcher.add(key, pattern) - assert len(self.attrs) == len(data["patterns"]) - assert len(self.indices) == len(data["patterns"]) - return self - def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList() + ) -> None: """Serialize the AttributeRuler to disk. path (Union[Path, str]): A path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. DOCS: https://spacy.io/api/attributeruler#to_disk """ - patterns = {k: self.matcher.get(k)[1] for k in range(len(self.attrs))} serialize = { "vocab": lambda p: self.vocab.to_disk(p), - "patterns": lambda p: srsly.write_msgpack(p, patterns), - "attrs": lambda p: srsly.write_msgpack(p, self.attrs), - "indices": lambda p: srsly.write_msgpack(p, self.indices), + "patterns": lambda p: srsly.write_msgpack(p, self.patterns), } util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[Path, str], exclude: Iterable[str] = tuple() + self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList() ) -> None: """Load the AttributeRuler from disk. @@ -269,31 +281,16 @@ class AttributeRuler(Pipe): exclude (Iterable[str]): String names of serialization fields to exclude. 
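A roundtrip sketch for the simplified serialization above (the pattern is illustrative): only the vocab and the combined `patterns` property are written now, and `from_bytes`/`from_disk` rebuild the matcher by re-adding those patterns.

```python
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler

nlp = English()
ruler = AttributeRuler(nlp.vocab)
ruler.add_patterns([{"patterns": [[{"TAG": "NNS"}]], "attrs": {"POS": "NOUN"}}])
# Serialize and reload into a fresh ruler; the patterns survive the roundtrip.
reloaded = AttributeRuler(nlp.vocab).from_bytes(ruler.to_bytes())
assert reloaded.patterns == ruler.patterns
```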
DOCS: https://spacy.io/api/attributeruler#from_disk """ - data = {"patterns": b""} def load_patterns(p): - data["patterns"] = srsly.read_msgpack(p) - - def load_attrs(p): - self.attrs = srsly.read_msgpack(p) - - def load_indices(p): - self.indices = srsly.read_msgpack(p) + self.add_patterns(srsly.read_msgpack(p)) deserialize = { "vocab": lambda p: self.vocab.from_disk(p), "patterns": load_patterns, - "attrs": load_attrs, - "indices": load_indices, } util.from_disk(path, deserialize, exclude) - if data["patterns"]: - for key, pattern in data["patterns"].items(): - self.matcher.add(key, pattern) - assert len(self.attrs) == len(data["patterns"]) - assert len(self.indices) == len(data["patterns"]) - return self diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index d92c700ba..c45cdce75 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -2,7 +2,7 @@ from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tu from pathlib import Path import srsly import random -from thinc.api import CosineDistance, get_array_module, Model, Optimizer, Config +from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate import warnings @@ -13,6 +13,7 @@ from ..language import Language from ..vocab import Vocab from ..gold import Example, validate_examples from ..errors import Errors, Warnings +from ..util import SimpleFrozenList from .. import util @@ -404,7 +405,7 @@ class EntityLinker(Pipe): token.ent_kb_id_ = kb_id def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), ) -> None: """Serialize the pipe to disk. @@ -421,7 +422,7 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), ) -> "EntityLinker": """Load the pipe from disk. Modifies the object in place and returns it. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4f4e0fdd5..5137dfec2 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -5,7 +5,7 @@ import srsly from ..language import Language from ..errors import Errors -from ..util import ensure_path, to_disk, from_disk +from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import Scorer @@ -68,7 +68,7 @@ class EntityRuler: ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, ) -> None: - """Initialize the entitiy ruler. If patterns are supplied here, they + """Initialize the entity ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either be a token pattern (list) or a phrase pattern (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. @@ -223,7 +223,7 @@ class EntityRuler: return all_patterns def add_patterns(self, patterns: List[PatternType]) -> None: - """Add patterns to the entitiy ruler. A pattern can either be a token + """Add patterns to the entity ruler. A pattern can either be a token pattern (list of dicts) or a phrase pattern (string). 
For example: {'label': 'ORG', 'pattern': 'Apple'} {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} @@ -317,7 +317,7 @@ class EntityRuler: return Scorer.score_spans(examples, "ents", **kwargs) def from_bytes( - self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple() + self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": """Load the entity ruler from a bytestring. @@ -341,7 +341,7 @@ class EntityRuler: self.add_patterns(cfg) return self - def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the entity ruler patterns to a bytestring. RETURNS (bytes): The serialized patterns. @@ -357,7 +357,7 @@ class EntityRuler: return srsly.msgpack_dumps(serial) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. @@ -394,7 +394,7 @@ class EntityRuler: return self def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 2a4274597..af24bf336 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -275,13 +275,21 @@ class Tagger(Pipe): err = Errors.E930.format(name="Tagger", obj=type(get_examples)) raise ValueError(err) tags = set() + doc_sample = [] for example in get_examples(): for token in example.y: tags.add(token.tag_) + if len(doc_sample) < 10: + doc_sample.append(example.x) + if not doc_sample: + doc_sample.append(Doc(self.vocab, words=["hello"])) for tag in sorted(tags): self.add_label(tag) self.set_output(len(self.labels)) - self.model.initialize() + if self.labels: + self.model.initialize(X=doc_sample) + else: + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd @@ -307,7 +315,7 @@ class Tagger(Pipe): examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag", "pos" and "lemma". + Scorer.score_token_attr for the attributes "tag". DOCS: https://spacy.io/api/tagger#score """ diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index f2d138cf7..dad66ddb3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -295,4 +295,19 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): model.verify_inputs(inputs) return model._outputs, model._backprop else: - return [doc.tensor for doc in inputs], lambda dX: [] + # This is pretty grim, but it's hard to do better :(. + # It's hard to avoid relying on the doc.tensor attribute, because the + # pipeline components can batch the data differently during prediction. + # That doesn't happen in update, where the nlp object works on batches + # of data. + # When the components batch differently, we don't receive a matching + # prediction from the upstream, so we can't predict. + if not all(doc.tensor.size for doc in inputs): + # But we do need to do *something* if the tensor hasn't been set. 
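A rough sketch of a custom training logger matching the `Logger` type alias added to `spacy/schemas.py` below and the `loggers` registry added in `spacy/util.py`. The registry name `my_logger.v1` and the keys read from the step dict are assumptions; only the shape (a setup callable that receives the `nlp` object and returns a `(log_step, finalize)` pair) comes from the schema.

```python
from typing import Any, Callable, Dict, Tuple
from spacy import registry
from spacy.language import Language

@registry.loggers("my_logger.v1")  # hypothetical name
def make_my_logger():
    def setup(nlp: Language) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
        def log_step(info: Dict[str, Any]) -> None:
            # the available keys are an assumption here
            print(info.get("epoch"), info.get("step"), info.get("score"))

        def finalize() -> None:
            pass

        return log_step, finalize

    return setup
```

Such a function would then be referenced from the config's `[training.logger]` block via `@loggers`, as the `spacy.ConsoleLogger.v1` entry in the `debug config` output further down suggests.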
+ # The compromise is to at least return data of the right shape, + # so the output is valid. + width = model.get_dim("nO") + outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs] + else: + outputs = [doc.tensor for doc in inputs] + return outputs, lambda dX: [] diff --git a/spacy/schemas.py b/spacy/schemas.py index 170342b54..be8db6a99 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type +from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator @@ -18,6 +18,7 @@ if TYPE_CHECKING: ItemT = TypeVar("ItemT") Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]] Reader = Callable[["Language", str], Iterable["Example"]] +Logger = Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]] def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: @@ -63,7 +64,7 @@ class TokenPatternString(BaseModel): class Config: extra = "forbid" - @validator("*", pre=True, each_item=True) + @validator("*", pre=True, each_item=True, allow_reuse=True) def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -84,7 +85,7 @@ class TokenPatternNumber(BaseModel): class Config: extra = "forbid" - @validator("*", pre=True, each_item=True) + @validator("*", pre=True, each_item=True, allow_reuse=True) def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -145,7 +146,7 @@ class TokenPattern(BaseModel): allow_population_by_field_name = True alias_generator = lambda value: value.upper() - @validator("*", pre=True) + @validator("*", pre=True, allow_reuse=True) def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -209,6 +210,7 @@ class ConfigSchemaTraining(BaseModel): init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") + logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") # fmt: on @@ -221,6 +223,7 @@ class ConfigSchemaNlp(BaseModel): # fmt: off lang: StrictStr = Field(..., title="The base language to use") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") + disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default") tokenizer: Callable = Field(..., title="The tokenizer to use") load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") @@ -233,6 +236,11 @@ class ConfigSchemaNlp(BaseModel): arbitrary_types_allowed = True +class ConfigSchemaPretrainEmpty(BaseModel): + class Config: + extra = "forbid" + + class ConfigSchemaPretrain(BaseModel): # fmt: off max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") @@ -257,16 +265,17 @@ class ConfigSchemaPretrain(BaseModel): class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp - pretraining: Optional[ConfigSchemaPretrain] + pretraining: 
Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] - @root_validator + @root_validator(allow_reuse=True) def validate_config(cls, values): """Perform additional validation for settings with dependencies.""" pt = values.get("pretraining") - if pt and pt.objective.get("type") == "vectors" and not values["nlp"].vectors: - err = "Need nlp.vectors if pretraining.objective.type is vectors" - raise ValueError(err) + if pt and not isinstance(pt, ConfigSchemaPretrainEmpty): + if pt.objective.get("type") == "vectors" and not values["nlp"].vectors: + err = "Need nlp.vectors if pretraining.objective.type is vectors" + raise ValueError(err) return values class Config: @@ -277,11 +286,28 @@ class ConfigSchema(BaseModel): # Project config Schema -class ProjectConfigAsset(BaseModel): +class ProjectConfigAssetGitItem(BaseModel): + # fmt: off + repo: StrictStr = Field(..., title="URL of Git repo to download from") + path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)") + branch: StrictStr = Field("master", title="Branch to clone from") + # fmt: on + + +class ProjectConfigAssetURL(BaseModel): # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") url: Optional[StrictStr] = Field(None, title="URL of asset") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + description: StrictStr = Field("", title="Description of asset") + # fmt: on + + +class ProjectConfigAssetGit(BaseModel): + # fmt: off + git: ProjectConfigAssetGitItem = Field(..., title="Git repo information") + checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + description: Optional[StrictStr] = Field(None, title="Description of asset") # fmt: on @@ -304,9 +330,10 @@ class ProjectConfigCommand(BaseModel): class ProjectConfigSchema(BaseModel): # fmt: off vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") - assets: List[ProjectConfigAsset] = Field([], title="Data assets") + assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") + title: Optional[str] = Field(None, title="Project title") # fmt: on class Config: diff --git a/spacy/scorer.py b/spacy/scorer.py index dc017f82f..9bbc64cac 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,10 +1,10 @@ -from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING +from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING import numpy as np from .gold import Example from .tokens import Token, Doc, Span from .errors import Errors -from .util import get_lang_class +from .util import get_lang_class, SimpleFrozenList from .morphology import Morphology if TYPE_CHECKING: @@ -317,7 +317,7 @@ class Scorer: attr: str, *, getter: Callable[[Doc, str], Any] = getattr, - labels: Iterable[str] = tuple(), + labels: Iterable[str] = SimpleFrozenList(), multi_label: bool = True, positive_label: Optional[str] = None, threshold: Optional[float] = None, @@ -413,6 +413,7 @@ class Scorer: macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats + macro_auc = sum(auc.score 
for auc in auc_per_type.values()) / n_cats results = { f"{attr}_score": None, f"{attr}_score_desc": None, @@ -422,7 +423,7 @@ class Scorer: f"{attr}_macro_p": macro_p, f"{attr}_macro_r": macro_r, f"{attr}_macro_f": macro_f, - f"{attr}_macro_auc": None, + f"{attr}_macro_auc": macro_auc, f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, } @@ -446,7 +447,7 @@ class Scorer: getter: Callable[[Token, str], Any] = getattr, head_attr: str = "head", head_getter: Callable[[Token, str], Token] = getattr, - ignore_labels: Tuple[str] = tuple(), + ignore_labels: Iterable[str] = SimpleFrozenList(), **cfg, ) -> Dict[str, Any]: """Returns the UAS, LAS, and LAS per type scores for dependency diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 28bbc9fc3..92607e120 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -472,7 +472,6 @@ def sort_nums(x): return x[1] -PRON_LEMMA = "-PRON-" NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] # Unfortunate hack here, to work around problem with long cpdef enum # (which is generating an enormous amount of C++ in Cython 0.24+) diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index bcde7bf63..96361a693 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -1,5 +1,6 @@ import pytest import numpy +from spacy.gold import Example from spacy.lang.en import English from spacy.pipeline import AttributeRuler from spacy import util, registry @@ -94,6 +95,43 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[3].morph_ == "Case=Nom|Number=Sing" +def test_attributeruler_score(nlp, pattern_dicts): + # initialize with patterns + nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) + doc = nlp("This is a test.") + assert doc[2].lemma_ == "the" + assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert doc[3].lemma_ == "cat" + assert doc[3].morph_ == "Case=Nom|Number=Sing" + + dev_examples = [ + Example.from_dict( + nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]} + ) + ] + scores = nlp.evaluate(dev_examples) + # "cat" is the only correct lemma + assert scores["lemma_acc"] == pytest.approx(0.2) + # the empty morphs are correct + assert scores["morph_acc"] == pytest.approx(0.6) + + +def test_attributeruler_rule_order(nlp): + a = AttributeRuler(nlp.vocab) + patterns = [ + {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "VERB"}}, + {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}}, + ] + a.add_patterns(patterns) + doc = get_doc( + nlp.vocab, + words=["This", "is", "a", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = a(doc) + assert doc[1].pos_ == "NOUN" + + def test_attributeruler_tag_map(nlp, tag_map): a = AttributeRuler(nlp.vocab) a.load_from_tag_map(tag_map) @@ -197,6 +235,7 @@ def test_attributeruler_serialize(nlp, pattern_dicts): assert a.to_bytes() == a_reloaded.to_bytes() doc1 = a_reloaded(nlp.make_doc(text)) numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs)) + assert a.patterns == a_reloaded.patterns # disk roundtrip with make_tempdir() as tmp_dir: @@ -205,3 +244,4 @@ def test_attributeruler_serialize(nlp, pattern_dicts): doc2 = nlp2(text) assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes() assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs)) + assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns diff --git 
a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index aa682fefe..f75c9ec8c 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -438,3 +438,26 @@ def test_pipe_factories_from_source_config(): config = nlp.config["components"]["custom"] assert config["factory"] == name assert config["arg"] == "world" + + +def test_pipe_factories_decorator_idempotent(): + """Check that decorator can be run multiple times if the function is the + same. This is especially relevant for live reloading because we don't + want spaCy to raise an error if a module registering components is reloaded. + """ + name = "test_pipe_factories_decorator_idempotent" + func = lambda nlp, name: lambda doc: doc + for i in range(5): + Language.factory(name, func=func) + nlp = Language() + nlp.add_pipe(name) + Language.factory(name, func=func) + # Make sure it also works for component decorator, which creates the + # factory function + name2 = f"{name}2" + func2 = lambda doc: doc + for i in range(5): + Language.component(name2, func=func2) + nlp = Language() + nlp.add_pipe(name) + Language.component(name2, func=func2) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index feb11cabc..ea09d990c 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,5 +1,6 @@ import pytest from spacy.language import Language +from spacy.util import SimpleFrozenList @pytest.fixture @@ -81,9 +82,9 @@ def test_replace_last_pipe(nlp): def test_replace_pipe_config(nlp): nlp.add_pipe("entity_linker") nlp.add_pipe("sentencizer") - assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == True + assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is True nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False}) - assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False + assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is False @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")]) @@ -181,6 +182,11 @@ def test_select_pipes_errors(nlp): with pytest.raises(ValueError): nlp.select_pipes(enable=[], disable=["c3"]) + disabled = nlp.select_pipes(disable=["c2"]) + nlp.remove_pipe("c2") + with pytest.raises(ValueError): + disabled.restore() + @pytest.mark.parametrize("n_pipes", [100]) def test_add_lots_of_pipes(nlp, n_pipes): @@ -249,3 +255,94 @@ def test_add_pipe_before_after(): nlp.add_pipe("entity_ruler", before=True) with pytest.raises(ValueError): nlp.add_pipe("entity_ruler", first=False) + + +def test_disable_enable_pipes(): + name = "test_disable_enable_pipes" + results = {} + + def make_component(name): + results[name] = "" + + def component(doc): + nonlocal results + results[name] = doc.text + return doc + + return component + + c1 = Language.component(f"{name}1", func=make_component(f"{name}1")) + c2 = Language.component(f"{name}2", func=make_component(f"{name}2")) + + nlp = Language() + nlp.add_pipe(f"{name}1") + nlp.add_pipe(f"{name}2") + assert results[f"{name}1"] == "" + assert results[f"{name}2"] == "" + assert nlp.pipeline == [(f"{name}1", c1), (f"{name}2", c2)] + assert nlp.pipe_names == [f"{name}1", f"{name}2"] + nlp.disable_pipe(f"{name}1") + assert nlp.disabled == [f"{name}1"] + assert nlp.component_names == [f"{name}1", f"{name}2"] + assert nlp.pipe_names == [f"{name}2"] + assert nlp.config["nlp"]["disabled"] == [f"{name}1"] + nlp("hello") + assert results[f"{name}1"] == "" # didn't run 
+ assert results[f"{name}2"] == "hello" # ran + nlp.enable_pipe(f"{name}1") + assert nlp.disabled == [] + assert nlp.pipe_names == [f"{name}1", f"{name}2"] + assert nlp.config["nlp"]["disabled"] == [] + nlp("world") + assert results[f"{name}1"] == "world" + assert results[f"{name}2"] == "world" + nlp.disable_pipe(f"{name}2") + nlp.remove_pipe(f"{name}2") + assert nlp.components == [(f"{name}1", c1)] + assert nlp.pipeline == [(f"{name}1", c1)] + assert nlp.component_names == [f"{name}1"] + assert nlp.pipe_names == [f"{name}1"] + assert nlp.disabled == [] + assert nlp.config["nlp"]["disabled"] == [] + nlp.rename_pipe(f"{name}1", name) + assert nlp.components == [(name, c1)] + assert nlp.component_names == [name] + nlp("!") + assert results[f"{name}1"] == "!" + assert results[f"{name}2"] == "world" + with pytest.raises(ValueError): + nlp.disable_pipe(f"{name}2") + nlp.disable_pipe(name) + assert nlp.component_names == [name] + assert nlp.pipe_names == [] + assert nlp.config["nlp"]["disabled"] == [name] + nlp("?") + assert results[f"{name}1"] == "!" + + +def test_pipe_methods_frozen(): + """Test that spaCy raises custom error messages if "frozen" properties are + accessed. We still want to use a list here to not break backwards + compatibility, but users should see an error if they're trying to append + to nlp.pipeline etc.""" + nlp = Language() + ner = nlp.add_pipe("ner") + assert nlp.pipe_names == ["ner"] + for prop in [ + nlp.pipeline, + nlp.pipe_names, + nlp.components, + nlp.component_names, + nlp.disabled, + nlp.factory_names, + ]: + assert isinstance(prop, list) + assert isinstance(prop, SimpleFrozenList) + with pytest.raises(NotImplementedError): + nlp.pipeline.append(("ner2", ner)) + with pytest.raises(NotImplementedError): + nlp.pipe_names.pop() + with pytest.raises(NotImplementedError): + nlp.components.sort() + with pytest.raises(NotImplementedError): + nlp.component_names.clear() diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index d16ecc1e6..39533f70a 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -161,6 +161,7 @@ def test_issue4674(): assert kb2.get_size_entities() == 1 +@pytest.mark.skip(reason="API change: disable just disables, new exclude arg") def test_issue4707(): """Tests that disabled component names are also excluded from nlp.from_disk by default when loading a model. 
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index f2b496d71..e425d370d 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -3,10 +3,11 @@ from thinc.config import Config, ConfigValidationError import spacy from spacy.lang.en import English from spacy.lang.de import German -from spacy.language import Language +from spacy.language import Language, DEFAULT_CONFIG from spacy.util import registry, load_model_from_config from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder +from spacy.schemas import ConfigSchema from ..util import make_tempdir @@ -208,6 +209,20 @@ def test_config_nlp_roundtrip(): assert new_nlp._factory_meta == nlp._factory_meta +def test_config_nlp_roundtrip_bytes_disk(): + """Test that the config is serialized correctly and not interpolated + by mistake.""" + nlp = English() + nlp_bytes = nlp.to_bytes() + new_nlp = English().from_bytes(nlp_bytes) + assert new_nlp.config == nlp.config + nlp = English() + with make_tempdir() as d: + nlp.to_disk(d) + new_nlp = spacy.load(d) + assert new_nlp.config == nlp.config + + def test_serialize_config_language_specific(): """Test that config serialization works as expected with language-specific factories.""" @@ -299,3 +314,26 @@ def test_config_interpolation(): nlp2 = English.from_config(interpolated) assert nlp2.config["training"]["train_corpus"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 + + +def test_config_optional_sections(): + config = Config().from_str(nlp_config_string) + config = DEFAULT_CONFIG.merge(config) + assert "pretraining" not in config + filled = registry.fill_config(config, schema=ConfigSchema, validate=False) + # Make sure that optional "pretraining" block doesn't default to None, + # which would (rightly) cause error because it'd result in a top-level + # key that's not a section (dict). Note that the following roundtrip is + # also how Config.interpolate works under the hood. 
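A rough sketch of the interpolation behaviour these tests rely on, assuming `Config.from_str` accepts `interpolate` like the `from_disk`/`from_bytes` calls above (the section names and values are made up): variables such as `${paths.train}` are only resolved when the config is interpolated, which is why serialization keeps the stored config uninterpolated.

```python
from thinc.api import Config

cfg_string = """
[paths]
train = "corpus/train.spacy"

[training]
train_path = ${paths.train}
"""
config = Config().from_str(cfg_string, interpolate=False)  # keep ${paths.train} as a reference
interpolated = config.interpolate()                        # resolve variables into a copy
assert interpolated["training"]["train_path"] == "corpus/train.spacy"
```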
+ new_config = Config().from_str(filled.to_str()) + assert new_config["pretraining"] == {} + + +def test_config_auto_fill_extra_fields(): + config = Config({"nlp": {"lang": "en"}, "training": {}}) + assert load_model_from_config(config, auto_fill=True) + config = Config({"nlp": {"lang": "en"}, "training": {"extra": "hello"}}) + nlp, _ = load_model_from_config(config, auto_fill=True, validate=False) + assert "extra" not in nlp.config["training"] + # Make sure the config generated is valid + load_model_from_config(nlp.config) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 7ba4815ee..db62f6569 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -6,6 +6,8 @@ from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL from spacy.pipeline.senter import DEFAULT_SENTER_MODEL +from spacy.lang.en import English +import spacy from ..util import make_tempdir @@ -173,3 +175,34 @@ def test_serialize_sentencerecognizer(en_vocab): sr_b = sr.to_bytes() sr_d = SentenceRecognizer(en_vocab, model).from_bytes(sr_b) assert sr.to_bytes() == sr_d.to_bytes() + + +def test_serialize_pipeline_disable_enable(): + nlp = English() + nlp.add_pipe("ner") + nlp.add_pipe("tagger") + nlp.disable_pipe("tagger") + assert nlp.config["nlp"]["disabled"] == ["tagger"] + config = nlp.config.copy() + nlp2 = English.from_config(config) + assert nlp2.pipe_names == ["ner"] + assert nlp2.component_names == ["ner", "tagger"] + assert nlp2.disabled == ["tagger"] + assert nlp2.config["nlp"]["disabled"] == ["tagger"] + with make_tempdir() as d: + nlp2.to_disk(d) + nlp3 = spacy.load(d) + assert nlp3.pipe_names == ["ner"] + assert nlp3.component_names == ["ner", "tagger"] + with make_tempdir() as d: + nlp3.to_disk(d) + nlp4 = spacy.load(d, disable=["ner"]) + assert nlp4.pipe_names == [] + assert nlp4.component_names == ["ner", "tagger"] + assert nlp4.disabled == ["ner", "tagger"] + with make_tempdir() as d: + nlp.to_disk(d) + nlp5 = spacy.load(d, exclude=["tagger"]) + assert nlp5.pipe_names == ["ner"] + assert nlp5.component_names == ["ner"] + assert nlp5.disabled == [] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 104c7c516..aa8ea6051 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,4 +1,6 @@ import pytest +from click import NoSuchOption + from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.lang.en import English @@ -270,6 +272,41 @@ def test_pretrain_make_docs(): assert skip_count == 0 +def test_project_config_validation_full(): + config = { + "vars": {"some_var": 20}, + "directories": ["assets", "configs", "corpus", "scripts", "training"], + "assets": [ + { + "dest": "x", + "url": "https://example.com", + "checksum": "63373dd656daa1fd3043ce166a59474c", + }, + { + "dest": "y", + "git": { + "repo": "https://github.com/example/repo", + "branch": "develop", + "path": "y", + }, + }, + ], + "commands": [ + { + "name": "train", + "help": "Train a model", + "script": ["python -m spacy train config.cfg -o training"], + "deps": ["config.cfg", "corpus/training.spcy"], + "outputs": ["training/model-best"], + }, + {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True}, + ], + "workflows": {"all": ["train", "test"], "train": ["train"]}, + } + errors = 
validate(ProjectConfigSchema, config) + assert not errors + + @pytest.mark.parametrize( "config", [ @@ -336,10 +373,17 @@ def test_parse_config_overrides(args, expected): @pytest.mark.parametrize( - "args", - [["--foo"], ["--x.foo", "bar", "--baz"], ["--x.foo", "bar", "baz"], ["x.foo"]], + "args", [["--foo"], ["--x.foo", "bar", "--baz"]], ) def test_parse_config_overrides_invalid(args): + with pytest.raises(NoSuchOption): + parse_config_overrides(args) + + +@pytest.mark.parametrize( + "args", [["--x.foo", "bar", "baz"], ["x.foo"]], +) +def test_parse_config_overrides_invalid_2(args): with pytest.raises(SystemExit): parse_config_overrides(args) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 47111a902..40cd71eb5 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -3,10 +3,9 @@ import pytest from .util import get_random_doc from spacy import util -from spacy.util import dot_to_object +from spacy.util import dot_to_object, SimpleFrozenList from thinc.api import Config, Optimizer from spacy.gold.batchers import minibatch_by_words - from ..lang.en import English from ..lang.nl import Dutch from ..language import DEFAULT_CONFIG_PATH @@ -106,3 +105,20 @@ def test_util_dot_section(): assert not dot_to_object(en_config, "nlp.load_vocab_data") assert dot_to_object(nl_config, "nlp.load_vocab_data") assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) + + +def test_simple_frozen_list(): + t = SimpleFrozenList(["foo", "bar"]) + assert t == ["foo", "bar"] + assert t.index("bar") == 1 # okay method + with pytest.raises(NotImplementedError): + t.append("baz") + with pytest.raises(NotImplementedError): + t.sort() + with pytest.raises(NotImplementedError): + t.extend(["baz"]) + with pytest.raises(NotImplementedError): + t.pop() + t = SimpleFrozenList(["foo", "bar"], error="Error!") + with pytest.raises(NotImplementedError): + t.append("baz") diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 9d17cec1c..a257c7919 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -10,7 +10,7 @@ from ..vocab import Vocab from ..compat import copy_reg from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors -from ..util import ensure_path +from ..util import ensure_path, SimpleFrozenList # fmt: off ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") @@ -52,7 +52,7 @@ class DocBin: self, attrs: Iterable[str] = ALL_ATTRS, store_user_data: bool = False, - docs: Iterable[Doc] = tuple(), + docs: Iterable[Doc] = SimpleFrozenList(), ) -> None: """Create a DocBin object to hold serialized annotations. diff --git a/spacy/util.py b/spacy/util.py index 736f4d805..0eb76c3d1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -81,6 +81,7 @@ class registry(thinc.registry): callbacks = catalogue.create("spacy", "callbacks") batchers = catalogue.create("spacy", "batchers", entry_points=True) readers = catalogue.create("spacy", "readers", entry_points=True) + loggers = catalogue.create("spacy", "loggers", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily # load them via the entry points. The "true" factories are added via the @@ -119,6 +120,47 @@ class SimpleFrozenDict(dict): raise NotImplementedError(self.error) +class SimpleFrozenList(list): + """Wrapper class around a list that lets us raise custom errors if certain + attributes/methods are accessed. 
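A small illustration of `SimpleFrozenList` (the list contents are arbitrary): reads behave like a normal list, while anything that would mutate it raises `NotImplementedError` with the default or custom error message.

```python
from spacy.util import SimpleFrozenList

frozen = SimpleFrozenList(["tagger", "ner"])
assert frozen.index("ner") == 1     # read-only access works as usual
try:
    frozen.append("parser")         # mutation is blocked
except NotImplementedError as err:
    print(err)
```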
Mostly used for properties like + Language.pipeline that return an immutable list (and that we don't want to + convert to a tuple to not break too much backwards compatibility). If a user + accidentally calls nlp.pipeline.append(), we can raise a more helpful error. + """ + + def __init__(self, *args, error: str = Errors.E927) -> None: + """Initialize the frozen list. + + error (str): The error message when user tries to mutate the list. + """ + self.error = error + super().__init__(*args) + + def append(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def clear(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def extend(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def insert(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def pop(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def remove(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def reverse(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def sort(self, *args, **kwargs): + raise NotImplementedError(self.error) + + def lang_class_is_loaded(lang: str) -> bool: """Check whether a Language class is already loaded. Language classes are loaded lazily, to avoid expensive setup code associated with the language @@ -214,7 +256,8 @@ def load_model( name: Union[str, Path], *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), + exclude: Iterable[str] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a package or data path. @@ -227,7 +270,7 @@ def load_model( keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. """ - kwargs = {"vocab": vocab, "disable": disable, "config": config} + kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config} if isinstance(name, str): # name or string path if name.startswith("blank:"): # shortcut for blank model return get_lang_class(name.replace("blank:", ""))() @@ -246,7 +289,8 @@ def load_model_from_package( name: str, *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), + exclude: Iterable[str] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package. @@ -254,13 +298,17 @@ def load_model_from_package( name (str): The package name. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. + disable (Iterable[str]): Names of pipeline components to disable. Disabled + pipes will be loaded but they won't be run unless you explicitly + enable them by calling nlp.enable_pipe. + exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. 
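A sketch of the `disable` vs. `exclude` distinction documented above; the package name is a placeholder for any installed pipeline.

```python
import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner"])   # loaded but not run
assert "ner" in nlp.component_names
assert "ner" not in nlp.pipe_names
nlp.enable_pipe("ner")                                # opt back in

nlp2 = spacy.load("en_core_web_sm", exclude=["ner"])  # never loaded at all
assert "ner" not in nlp2.component_names
```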
""" cls = importlib.import_module(name) - return cls.load(vocab=vocab, disable=disable, config=config) + return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) def load_model_from_path( @@ -268,7 +316,8 @@ def load_model_from_path( *, meta: Optional[Dict[str, Any]] = None, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), + exclude: Iterable[str] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a data directory path. Creates Language class with @@ -278,7 +327,11 @@ def load_model_from_path( meta (Dict[str, Any]): Optional model meta. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. + disable (Iterable[str]): Names of pipeline components to disable. Disabled + pipes will be loaded but they won't be run unless you explicitly + enable them by calling nlp.enable_pipe. + exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. @@ -289,15 +342,18 @@ def load_model_from_path( meta = get_model_meta(model_path) config_path = model_path / "config.cfg" config = load_config(config_path, overrides=dict_to_dot(config)) - nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable) - return nlp.from_disk(model_path, exclude=disable) + nlp, _ = load_model_from_config( + config, vocab=vocab, disable=disable, exclude=exclude + ) + return nlp.from_disk(model_path, exclude=exclude) def load_model_from_config( config: Union[Dict[str, Any], Config], *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), + exclude: Iterable[str] = SimpleFrozenList(), auto_fill: bool = False, validate: bool = True, ) -> Tuple["Language", Config]: @@ -308,7 +364,11 @@ def load_model_from_config( meta (Dict[str, Any]): Optional model meta. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. + disable (Iterable[str]): Names of pipeline components to disable. Disabled + pipes will be loaded but they won't be run unless you explicitly + enable them by calling nlp.enable_pipe. + exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + components won't be loaded. auto_fill (bool): Whether to auto-fill config with missing defaults. validate (bool): Whether to show config validation errors. RETURNS (Language): The loaded nlp object. 
@@ -322,7 +382,12 @@ def load_model_from_config( # registry, including custom subclasses provided via entry points lang_cls = get_lang_class(nlp_config["lang"]) nlp = lang_cls.from_config( - config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate, + config, + vocab=vocab, + disable=disable, + exclude=exclude, + auto_fill=auto_fill, + validate=validate, ) return nlp, nlp.resolved @@ -331,7 +396,8 @@ def load_model_from_init_py( init_file: Union[Path, str], *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = tuple(), + disable: Iterable[str] = SimpleFrozenList(), + exclude: Iterable[str] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's @@ -339,7 +405,11 @@ def load_model_from_init_py( vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. + disable (Iterable[str]): Names of pipeline components to disable. Disabled + pipes will be loaded but they won't be run unless you explicitly + enable them by calling nlp.enable_pipe. + exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. @@ -351,7 +421,12 @@ def load_model_from_init_py( if not model_path.exists(): raise IOError(Errors.E052.format(path=data_path)) return load_model_from_path( - data_path, vocab=vocab, meta=meta, disable=disable, config=config + data_path, + vocab=vocab, + meta=meta, + disable=disable, + exclude=exclude, + config=config, ) @@ -572,7 +647,7 @@ def join_command(command: List[str]) -> str: return " ".join(shlex.quote(cmd) for cmd in command) -def run_command(command: Union[str, List[str]]) -> None: +def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -582,13 +657,22 @@ def run_command(command: Union[str, List[str]]) -> None: if isinstance(command, str): command = split_command(command) try: - status = subprocess.call(command, env=os.environ.copy()) + ret = subprocess.run( + command, + env=os.environ.copy(), + input=stdin, + encoding="utf8", + check=True, + stdout=subprocess.PIPE if capture else None, + stderr=subprocess.PIPE if capture else None, + ) except FileNotFoundError: raise FileNotFoundError( Errors.E970.format(str_command=" ".join(command), tool=command[0]) ) from None - if status != 0: - sys.exit(status) + if ret.returncode != 0: + sys.exit(ret.returncode) + return ret @contextmanager @@ -663,6 +747,25 @@ def get_object_name(obj: Any) -> str: return repr(obj) +def is_same_func(func1: Callable, func2: Callable) -> bool: + """Approximately decide whether two functions are the same, even if their + identity is different (e.g. after they have been live reloaded). Mostly + used in the @Language.component and @Language.factory decorators to decide + whether to raise if a factory already exists. Allows decorator to run + multiple times with the same function. + + func1 (Callable): The first function. + func2 (Callable): The second function. + RETURNS (bool): Whether it's the same function (most likely). 
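A sketch of what this check enables (the component name is hypothetical): re-running the same registration, for example after a module is live-reloaded, no longer raises as long as the function is effectively unchanged.

```python
from spacy.language import Language

@Language.component("my_component")  # hypothetical name
def my_component(doc):
    return doc

# Simulating a reload: registering the same function again is a no-op
# instead of an error, thanks to the is_same_func check.
Language.component("my_component", func=my_component)
```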
+ """ + if not callable(func1) or not callable(func2): + return False + same_name = func1.__qualname__ == func2.__qualname__ + same_file = inspect.getfile(func1) == inspect.getfile(func2) + same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2) + return same_name and same_file and same_code + + def get_cuda_stream( require: bool = False, non_blocking: bool = True ) -> Optional[CudaStream]: diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 98f267e87..fc72eda98 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -12,7 +12,8 @@ The attribute ruler lets you set token attributes for tokens identified by [`Matcher` patterns](/usage/rule-based-matching#matcher). The attribute ruler is typically used to handle exceptions for token attributes and to map values between attributes such as mapping fine-grained POS tags to coarse-grained POS -tags. +tags. See the [usage guide](/usage/linguistic-features/#mappings-exceptions) for +examples. ## Config and implementation {#config} @@ -138,6 +139,21 @@ Get all patterns that have been added to the attribute ruler in the | ----------- | -------------------------------------------------------------------------------------------- | | **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ | +## AttributeRuler.score {#score tag="method" new="3"} + +Score a batch of examples. + +> #### Example +> +> ```python +> scores = attribute_ruler.score(examples) +> ``` + +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | + ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"} Load attribute ruler patterns from a tag map. diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7ce95c019..9070855fa 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -146,8 +146,12 @@ validation error with more details. > #### Example > > ```cli -> $ python -m spacy init fill-config base.cfg config.cfg +> $ python -m spacy init fill-config base.cfg config.cfg --diff > ``` +> +> #### Example diff +> +> ![Screenshot of visual diff in terminal](../images/cli_init_fill-config_diff.jpg) ```cli $ python -m spacy init fill-config [base_path] [output_file] [--diff] @@ -242,19 +246,19 @@ some config validation errors are blocking and will prevent the rest of the config from being resolved. This means that you may not see all validation errors at once and some issues are only shown once previous errors have been fixed. To auto-fill a partial config and save the result, you can use the -[`init fillconfig`](/api/cli#init-fill-config) command. +[`init fill-config`](/api/cli#init-fill-config) command. 
```cli -$ python -m spacy debug config [config_path] [--code_path] [overrides] +$ python -m spacy debug config [config_path] [--code-path] [--show-functions] [--show-variables] [overrides] ``` > #### Example > > ```cli -> $ python -m spacy debug config ./config.cfg +> $ python -m spacy debug config config.cfg > ``` - + ``` ✘ Config validation error @@ -273,13 +277,127 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start -| Name | Description | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | -| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | -| **PRINTS** | Config validation errors, if available. | + + +```cli +$ python -m spacy debug config ./config.cfg --show-functions --show-variables +``` + +``` +============================= Config validation ============================= +✔ Config is valid + +=============================== Variables (6) =============================== + +Variable Value +----------------------------------------- ---------------------------------- +${components.tok2vec.model.encode.width} 96 +${paths.dev} 'hello' +${paths.init_tok2vec} None +${paths.raw} None +${paths.train} '' +${system.seed} 0 + + +========================= Registered functions (17) ========================= +ℹ [nlp.tokenizer] +Registry @tokenizers +Name spacy.Tokenizer.v1 +Module spacy.language +File /path/to/spacy/language.py (line 64) +ℹ [components.ner.model] +Registry @architectures +Name spacy.TransitionBasedParser.v1 +Module spacy.ml.models.parser +File /path/to/spacy/ml/models/parser.py (line 11) +ℹ [components.ner.model.tok2vec] +Registry @architectures +Name spacy.Tok2VecListener.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 16) +ℹ [components.parser.model] +Registry @architectures +Name spacy.TransitionBasedParser.v1 +Module spacy.ml.models.parser +File /path/to/spacy/ml/models/parser.py (line 11) +ℹ [components.parser.model.tok2vec] +Registry @architectures +Name spacy.Tok2VecListener.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 16) +ℹ [components.tagger.model] +Registry @architectures +Name spacy.Tagger.v1 +Module spacy.ml.models.tagger +File /path/to/spacy/ml/models/tagger.py (line 9) +ℹ [components.tagger.model.tok2vec] +Registry @architectures +Name spacy.Tok2VecListener.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 16) +ℹ [components.tok2vec.model] +Registry @architectures +Name spacy.Tok2Vec.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 72) +ℹ [components.tok2vec.model.embed] +Registry @architectures +Name spacy.MultiHashEmbed.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 93) +ℹ [components.tok2vec.model.encode] +Registry 
@architectures +Name spacy.MaxoutWindowEncoder.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 207) +ℹ [training.logger] +Registry @loggers +Name spacy.ConsoleLogger.v1 +Module spacy.gold.loggers +File /path/to/spacy/gold/loggers.py (line 8) +ℹ [training.batcher] +Registry @batchers +Name batch_by_words.v1 +Module spacy.gold.batchers +File /path/to/spacy/gold/batchers.py (line 49) +ℹ [training.batcher.size] +Registry @schedules +Name compounding.v1 +Module thinc.schedules +File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 43) +ℹ [training.dev_corpus] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.gold.corpus +File /path/to/spacy/gold/corpus.py (line 18) +ℹ [training.optimizer] +Registry @optimizers +Name Adam.v1 +Module thinc.optimizers +File /Users/ines/Repos/explosion/thinc/thinc/optimizers.py (line 58) +ℹ [training.optimizer.learn_rate] +Registry @schedules +Name warmup_linear.v1 +Module thinc.schedules +File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 91) +ℹ [training.train_corpus] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.gold.corpus +File /path/to/spacy/gold/corpus.py (line 18) +``` + + + +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code-path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ | +| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **PRINTS** | Config validation errors, if available. | ### debug data {#debug-data tag="command"} @@ -660,8 +778,10 @@ for more info. As of spaCy v3.0, the `pretrain` command takes the same [config file](/usage/training#config) as the `train` command. This ensures that settings are consistent between pretraining and training. Settings for -pretraining can be defined in the `[pretraining]` block of the config file. See -the [data format](/api/data-formats#config) for details. +pretraining can be defined in the `[pretraining]` block of the config file and +auto-generated by setting `--pretraining` on +[`init fill-config`](/api/cli#init-fill-config). Also see the +[data format](/api/data-formats#config) for details. @@ -933,6 +1053,41 @@ $ python -m spacy project pull [remote] [project_dir] | `--help`, `-h` | Show help message and available arguments. 
~~bool (flag)~~ | | **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. | +### project document {#project-document tag="command"} + +Auto-generate a pretty Markdown-formatted `README` for your project, based on +its [`project.yml`](/usage/projects#project-yml). Will create sections that +document the available commands, workflows and assets. The auto-generated +content will be placed between two hidden markers, so you can add your own +custom content before or after the auto-generated documentation. When you re-run +the `project document` command, only the auto-generated part is replaced. + +```cli +$ python -m spacy project document [project_dir] [--output] [--no-emoji] +``` + +> #### Example +> +> ```cli +> $ python -m spacy project document --output README.md +> ``` + + + +For more examples, see the templates in our +[`projects`](https://github.com/explosion/projects) repo. + +![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg) + + + +| Name | Description | +| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ | +|  `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ | +| **CREATES** | The Markdown-formatted project documentation. | + ### project dvc {#project-dvc tag="command"} Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 727c0f35c..8ef8041ee 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -74,15 +74,16 @@ your config and check that it's valid, you can run the Defines the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. -| Name | Description | -| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | -| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ | -| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | -| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | -| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. 
~~Optional[Callable[[Language], Language]]~~ | -| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ | +| Name | Description | +| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | +| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ | +| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | +| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | +| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | +| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ | ### components {#config-components tag="section"} @@ -375,7 +376,8 @@ The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the "token-to-vector" embedding layer of pipeline components from raw text. Raw text can be provided as a `.jsonl` (newline-delimited JSON) file containing one input text per line (roughly paragraph length is good). Optionally, custom -tokenization can be provided. +tokenization can be provided. The JSONL format means that the texts can be read +in line-by-line, while still making it easy to represent newlines in the data. > #### Tip: Writing JSONL > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 34e3569a7..e2668c522 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -357,35 +357,6 @@ their original weights after the block. | -------- | ------------------------------------------------------ | | `params` | A dictionary of parameters keyed by model ID. ~~dict~~ | -## Language.create_pipe {#create_pipe tag="method" new="2"} - -Create a pipeline component from a factory. 
- - - -As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes -the string name of the factory, creates the component, adds it to the pipeline -and returns it. The `Language.create_pipe` method is now mostly used internally. -To create a component and add it to the pipeline, you should always use -`Language.add_pipe`. - - - -> #### Example -> -> ```python -> parser = nlp.create_pipe("parser") -> ``` - -| Name | Description | -| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `factory_name` | Name of the registered component factory. ~~str~~ | -| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ | -| _keyword-only_ | | -| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ | -| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | -| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | - ## Language.add_pipe {#add_pipe tag="method" new="2"} Add a component to the processing pipeline. Expects a name that maps to a @@ -434,6 +405,35 @@ component, adds it to the pipeline and returns it. | `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | | **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | +## Language.create_pipe {#create_pipe tag="method" new="2"} + +Create a pipeline component from a factory. + + + +As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes +the string name of the factory, creates the component, adds it to the pipeline +and returns it. The `Language.create_pipe` method is now mostly used internally. +To create a component and add it to the pipeline, you should always use +`Language.add_pipe`. + + + +> #### Example +> +> ```python +> parser = nlp.create_pipe("parser") +> ``` + +| Name | Description | +| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory_name` | Name of the registered component factory. ~~str~~ | +| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ | +| _keyword-only_ | | +| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ | +| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | + ## Language.has_factory {#has_factory tag="classmethod" new="3"} Check whether a factory name is registered on the `Language` class or subclass. @@ -561,6 +561,54 @@ component function. | `name` | Name of the component to remove. ~~str~~ | | **RETURNS** | A `(name, component)` tuple of the removed component. 
~~Tuple[str, Callable[[Doc], Doc]]~~ | +## Language.disable_pipe {#disable_pipe tag="method" new="3"} + +Temporarily disable a pipeline component so it's not run as part of the +pipeline. Disabled components are listed in +[`nlp.disabled`](/api/language#attributes) and included in +[`nlp.components`](/api/language#attributes), but not in +[`nlp.pipeline`](/api/language#pipeline), so they're not run when you process a +`Doc` with the `nlp` object. If the component is already disabled, this method +does nothing. + +> #### Example +> +> ```python +> nlp.add_pipe("ner") +> nlp.add_pipe("textcat") +> assert nlp.pipe_names == ["ner", "textcat"] +> nlp.disable_pipe("ner") +> assert nlp.pipe_names == ["textcat"] +> assert nlp.component_names == ["ner", "textcat"] +> assert nlp.disabled == ["ner"] +> ``` + +| Name | Description | +| ------ | ----------------------------------------- | +| `name` | Name of the component to disable. ~~str~~ | + +## Language.enable_pipe {#enable_pipe tag="method" new="3"} + +Enable a previously disable component (e.g. via +[`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of +the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is +already enabled, this method does nothing. + +> #### Example +> +> ```python +> nlp.disable_pipe("ner") +> assert "ner" in nlp.disabled +> assert not "ner" in nlp.pipe_names +> nlp.enable_pipe("ner") +> assert not "ner" in nlp.disabled +> assert "ner" in nlp.pipe_names +> ``` + +| Name | Description | +| ------ | ---------------------------------------- | +| `name` | Name of the component to enable. ~~str~~ | + ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} Disable one or more pipeline components. If used as a context manager, the @@ -568,7 +616,9 @@ pipeline will be restored to the initial state at the end of the block. Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method you can use to undo your changes. You can specify either `disable` (as a list or string), or `enable`. In the latter case, all components not in the `enable` -list, will be disabled. +list, will be disabled. Under the hood, this method calls into +[`disable_pipe`](/api/language#disable_pipe) and +[`enable_pipe`](/api/language#enable_pipe). > #### Example > @@ -860,18 +910,21 @@ available to the loaded object. ## Attributes {#attributes} -| Name | Description | -| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A container for the lexical types. ~~Vocab~~ | -| `tokenizer` | The tokenizer. ~~Tokenizer~~ | -| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ | -| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[str, Callable[[Doc], Doc]]~~ | -| `pipe_names` 2 | List of pipeline component names, in order. ~~List[str]~~ | -| `pipe_labels` 2.2 | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ | -| `pipe_factories` 2.2 | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ | -| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ | -| `factory_names` 3 | List of all available factory names. ~~List[str]~~ | -| `path` 2 | Path to the model data directory, if a model is loaded. 
Otherwise `None`. ~~Optional[Path]~~ | +| Name | Description | +| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A container for the lexical types. ~~Vocab~~ | +| `tokenizer` | The tokenizer. ~~Tokenizer~~ | +| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ | +| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ | +| `pipe_names` 2 | List of pipeline component names, in order. ~~List[str]~~ | +| `pipe_labels` 2.2 | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ | +| `pipe_factories` 2.2 | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ | +| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ | +| `factory_names` 3 | List of all available factory names. ~~List[str]~~ | +| `components` 3 | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ | +| `component_names` 3 | List of all available component names, including components that are currently disabled. ~~List[str]~~ | +| `disabled` 3 | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ | +| `path` 2 | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ | ## Class attributes {#class-attributes} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 8417fd5e8..45a8736db 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -25,9 +25,10 @@ added to your pipeline, and not a hidden part of the vocab that runs behind the scenes. This makes it easier to customize how lemmas should be assigned in your pipeline. -If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to -be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning -tags is available in the pipeline and runs _before_ the lemmatizer. +If the lemmatization mode is set to `"rule"`, which requires coarse-grained POS +(`Token.pos`) to be assigned, make sure a [`Tagger`](/api/tagger), +[`Morphologizer`](/api/morphologizer) or another component assigning POS is +available in the pipeline and runs _before_ the lemmatizer. diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 06bef32ba..3d9f61e8d 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -10,7 +10,7 @@ api_trainable: true --- A trainable pipeline component for sentence segmentation. For a simpler, -ruse-based strategy, see the [`Sentencizer`](/api/sentencizer). +rule-based strategy, see the [`Sentencizer`](/api/sentencizer). ## Config and implementation {#config} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index b255b2261..af0e3af3c 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -252,7 +252,7 @@ Score a batch of examples. | Name | Description | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | `examples` | The examples to score. 
~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ | ## Tagger.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 797fa0191..c2cc5bbab 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -23,6 +23,14 @@ path, spaCy will assume it's a data directory, load its information to construct the `Language` class. The data will be loaded in via [`Language.from_disk`](/api/language#from_disk). + + +As of v3.0, the `disable` keyword argument specifies components to load but +disable, instead of components to not load at all. Those components can now be +specified separately using the new `exclude` keyword argument. + + + > #### Example > > ```python @@ -30,16 +38,17 @@ information to construct the `Language` class. The data will be loaded in via > nlp = spacy.load("/path/to/en") # string path > nlp = spacy.load(Path("/path/to/en")) # pathlib Path > -> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) +> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"]) > ``` -| Name | Description | -| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ | -| _keyword-only_ | | -| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | -| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ | +| Name | Description | +| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | +| `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | +| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ | Essentially, `spacy.load()` is a convenience wrapper that reads the model's [`config.cfg`](/api/data-formats#config), uses the language and pipeline @@ -562,17 +571,18 @@ and create a `Language` object. 
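To make the difference between `disable` and `exclude` concrete, here is a minimal sketch based on the tables above (it assumes the `en_core_web_sm` package is installed; the component names are only examples):

```python
import spacy

# Disabled components are loaded but not run; they can be switched back on
# later with nlp.enable_pipe
nlp = spacy.load("en_core_web_sm", disable=["tagger"])
assert "tagger" in nlp.component_names
assert "tagger" not in nlp.pipe_names
nlp.enable_pipe("tagger")
assert "tagger" in nlp.pipe_names

# Excluded components are not loaded at all, so they can't be re-enabled later
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
assert "ner" not in nlp.component_names
```

The same `disable` and `exclude` arguments are available on the `util.load_model` helpers described below.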
The model data will then be loaded in via > > ```python > nlp = util.load_model("en_core_web_sm") -> nlp = util.load_model("en_core_web_sm", disable=["ner"]) +> nlp = util.load_model("en_core_web_sm", exclude=["ner"]) > nlp = util.load_model("/path/to/data") > ``` -| Name | Description | -| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | Package name or model path. ~~str~~ | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | -| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ | -| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | `Language` class with the loaded model. ~~Language~~ | +| Name | Description | +| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Package name or model path. ~~str~~ | +| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | +| `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | +| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | `Language` class with the loaded model. ~~Language~~ | ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"} @@ -588,13 +598,14 @@ A helper function to use in the `load()` method of a model package's > return load_model_from_init_py(__file__, **overrides) > ``` -| Name | Description | -| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | -| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ | -| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | `Language` class with the loaded model. ~~Language~~ | +| Name | Description | +| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. 
~~Union[str, Path]~~ | +| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | +| `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | +| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | `Language` class with the loaded model. ~~Language~~ | ### util.load_config {#util.load_config tag="function" new="3"} diff --git a/website/docs/images/cli_init_fill-config_diff.jpg b/website/docs/images/cli_init_fill-config_diff.jpg new file mode 100644 index 000000000..3e3751726 Binary files /dev/null and b/website/docs/images/cli_init_fill-config_diff.jpg differ diff --git a/website/docs/images/project_document.jpg b/website/docs/images/project_document.jpg new file mode 100644 index 000000000..7942619a8 Binary files /dev/null and b/website/docs/images/project_document.jpg differ diff --git a/website/docs/usage/101/_language-data.md b/website/docs/usage/101/_language-data.md index 8c3cd48a3..f1fa1f3a2 100644 --- a/website/docs/usage/101/_language-data.md +++ b/website/docs/usage/101/_language-data.md @@ -22,15 +22,15 @@ values are defined in the [`Language.Defaults`](/api/language#defaults). > nlp_de = German() # Includes German data > ``` -| Name | Description | -| ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Stop words**
[`stop_words.py`][stop_words.py] | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | -| **Tokenizer exceptions**
[`tokenizer_exceptions.py`][tokenizer_exceptions.py] | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | -| **Punctuation rules**
[`punctuation.py`][punctuation.py] | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | -| **Character classes**
[`char_classes.py`][char_classes.py] | Character classes to be used in regular expressions, for example, latin characters, quotes, hyphens or icons. | -| **Lexical attributes**
[`lex_attrs.py`][lex_attrs.py] | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | -| **Syntax iterators**
[`syntax_iterators.py`][syntax_iterators.py] | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | -| **Lemmatizer**
[`spacy-lookups-data`][spacy-lookups-data] | Lemmatization rules or a lookup-based lemmatization table to assign base forms, for example "be" for "was". | +| Name | Description | +| ----------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Stop words**
[`stop_words.py`][stop_words.py] | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | +| **Tokenizer exceptions**
[`tokenizer_exceptions.py`][tokenizer_exceptions.py] | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | +| **Punctuation rules**
[`punctuation.py`][punctuation.py] | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | +| **Character classes**
[`char_classes.py`][char_classes.py] | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | +| **Lexical attributes**
[`lex_attrs.py`][lex_attrs.py] | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | +| **Syntax iterators**
[`syntax_iterators.py`][syntax_iterators.py] | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | +| **Lemmatizer**
[`lemmatizer.py`][lemmatizer.py] [`spacy-lookups-data`][spacy-lookups-data] | Custom lemmatizer implementation and lemmatization tables. | [stop_words.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py @@ -44,4 +44,6 @@ values are defined in the [`Language.Defaults`](/api/language#defaults). https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py [syntax_iterators.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py +[lemmatizer.py]: + https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/lemmatizer.py [spacy-lookups-data]: https://github.com/explosion/spacy-lookups-data diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md index 295aa6e52..0aa821223 100644 --- a/website/docs/usage/101/_pipelines.md +++ b/website/docs/usage/101/_pipelines.md @@ -1,9 +1,9 @@ When you call `nlp` on a text, spaCy first tokenizes the text to produce a `Doc` object. The `Doc` is then processed in several different steps – this is also referred to as the **processing pipeline**. The pipeline used by the -[default models](/models) consists of a tagger, a parser and an entity -recognizer. Each pipeline component returns the processed `Doc`, which is then -passed on to the next component. +[default models](/models) typically include a tagger, a lemmatizer, a parser and +an entity recognizer. Each pipeline component returns the processed `Doc`, which +is then passed on to the next component. ![The processing pipeline](../../images/pipeline.svg) @@ -12,15 +12,16 @@ passed on to the next component. > - **Creates:** Objects, attributes and properties modified and set by the > component. -| Name | Component | Creates | Description | -| -------------- | ------------------------------------------------------------------ | --------------------------------------------------------- | ------------------------------------------------ | -| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. | -| **tagger** | [`Tagger`](/api/tagger) | `Token.tag` | Assign part-of-speech tags. | -| **parser** | [`DependencyParser`](/api/dependencyparser) | `Token.head`, `Token.dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. | -| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Token.ent_iob`, `Token.ent_type` | Detect and label named entities. | -| **lemmatizer** | [`Lemmatizer`](/api/lemmatizer) | `Token.lemma` | Assign base forms. | -| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | -| **custom** | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | +| Name | Component | Creates | Description | +| --------------------- | ------------------------------------------------------------------ | --------------------------------------------------------- | ------------------------------------------------ | +| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. | +| _processing pipeline_ | | | +| **tagger** | [`Tagger`](/api/tagger) | `Token.tag` | Assign part-of-speech tags. | +| **parser** | [`DependencyParser`](/api/dependencyparser) | `Token.head`, `Token.dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. | +| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Token.ent_iob`, `Token.ent_type` | Detect and label named entities. 
| +| **lemmatizer** | [`Lemmatizer`](/api/lemmatizer) | `Token.lemma` | Assign base forms. | +| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | +| **custom** | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | The processing pipeline always **depends on the statistical model** and its capabilities. For example, a pipeline can only include an entity recognizer @@ -43,6 +44,8 @@ recognizer doesn't use any features set by the tagger and parser, and so on. This means that you can swap them, or remove single components from the pipeline without affecting the others. However, components may share a "token-to-vector" component like [`Tok2Vec`](/api/tok2vec) or [`Transformer`](/api/transformer). +You can read more about this in the docs on +[embedding layers](/usage/embeddings-transformers#embedding-layers). Custom components may also depend on annotations set by other components. For example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 7648a5d45..75be71845 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -107,7 +107,62 @@ transformer outputs to the [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, giving you access to them after the pipeline has finished running. - +### Example: Shared vs. independent config {#embedding-layers-config} + +The [config system](/usage/training#config) lets you express model configuration +for both shared and independent embedding layers. The shared setup uses a single +[`Tok2Vec`](/api/tok2vec) component with the +[Tok2Vec](/api/architectures#Tok2Vec) architecture. All other components, like +the entity recognizer, use a +[Tok2VecListener](/api/architectures#Tok2VecListener) layer as their model's +`tok2vec` argument, which connects to the `tok2vec` component model. + +```ini +### Shared {highlight="1-2,4-5,19-20"} +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" + +[components.ner] +factory = "ner" + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +``` + +In the independent setup, the entity recognizer component defines its own +[Tok2Vec](/api/architectures#Tok2Vec) instance. Other components will do the +same. This makes them fully independent and doesn't require an upstream +[`Tok2Vec`](/api/tok2vec) component to be present in the pipeline. + +```ini +### Independent {highlight="7-8"} +[components.ner] +factory = "ner" + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[components.ner.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" + +[components.ner.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +``` @@ -124,7 +179,7 @@ interoperates with [PyTorch](https://pytorch.org) and the giving you access to thousands of pretrained models for your pipelines. 
There are many [great guides](http://jalammar.github.io/illustrated-transformer/) to transformer models, but for practical purposes, you can simply think of them as -a drop-in replacement that let you achieve **higher accuracy** in exchange for +drop-in replacements that let you achieve **higher accuracy** in exchange for **higher training and runtime costs**. ### Setup and installation {#transformers-installation} @@ -170,10 +225,12 @@ transformers as subnetworks directly, you can also use them via the ![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) -The `Transformer` component sets the +By default, the `Transformer` component sets the [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, which lets you access the transformers outputs at runtime. + + ```cli $ python -m spacy download en_core_trf_lg ``` @@ -194,8 +251,8 @@ for doc in nlp.pipe(["some text", "some other text"]): tokvecs = doc._.trf_data.tensors[-1] ``` -You can also customize how the [`Transformer`](/api/transformer) component sets -annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. +You can customize how the [`Transformer`](/api/transformer) component sets +annotations onto the [`Doc`](/api/doc), by changing the `annotation_setter`. This callback will be called with the raw input and output data for the whole batch, along with the batch of `Doc` objects, allowing you to implement whatever you need. The annotation setter is called with a batch of [`Doc`](/api/doc) @@ -204,13 +261,15 @@ containing the transformers data for the batch. ```python def custom_annotation_setter(docs, trf_data): - # TODO: - ... + doc_data = list(trf_data.doc_data) + for doc, data in zip(docs, doc_data): + doc._.custom_attr = data nlp = spacy.load("en_core_trf_lg") nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter doc = nlp("This is a text") -print() # TODO: +assert isinstance(doc._.custom_attr, TransformerData) +print(doc._.custom_attr.tensors) ``` ### Training usage {#transformers-training} @@ -244,7 +303,7 @@ component: > > ```python > from spacy_transformers import Transformer, TransformerModel -> from spacy_transformers.annotation_setters import null_annotation_setter +> from spacy_transformers.annotation_setters import configure_trfdata_setter > from spacy_transformers.span_getters import get_doc_spans > > trf = Transformer( @@ -254,7 +313,7 @@ component: > get_spans=get_doc_spans, > tokenizer_config={"use_fast": True}, > ), -> annotation_setter=null_annotation_setter, +> annotation_setter=configure_trfdata_setter(), > max_batch_items=4096, > ) > ``` @@ -274,7 +333,7 @@ tokenizer_config = {"use_fast": true} @span_getters = "doc_spans.v1" [components.transformer.annotation_setter] -@annotation_setters = "spacy-transformers.null_annotation_setter.v1" +@annotation_setters = "spacy-transformers.trfdata_setter.v1" ``` @@ -288,9 +347,9 @@ in a block starts with `@`, it's **resolved to a function** and all other settings are passed to the function as arguments. In this case, `name`, `tokenizer_config` and `get_spans`. -`get_spans` is a function that takes a batch of `Doc` object and returns lists +`get_spans` is a function that takes a batch of `Doc` objects and returns lists of potentially overlapping `Span` objects to process by the transformer. 
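As an illustration of that signature, a whole-document getter could be as simple as the following sketch (presumably close to what the built-in `doc_spans.v1` getter does; registering your own getters is covered a little further below):

```python
from typing import List
from spacy.tokens import Doc, Span

def get_whole_doc_spans(docs: List[Doc]) -> List[List[Span]]:
    # One list of spans per Doc; here each Doc contributes a single Span
    # covering its entire text
    return [[doc[:]] for doc in docs]
```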
Several -[built-in functions](/api/transformer#span-getters) are available – for example, +[built-in functions](/api/transformer#span_getters) are available – for example, to process the whole document or individual sentences. When the config is resolved, the function is created and passed into the model as an argument. @@ -311,13 +370,17 @@ To change any of the settings, you can edit the `config.cfg` and re-run the training. To change any of the functions, like the span getter, you can replace the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to process sentences. You can also register your own functions using the -`span_getters` registry: +[`span_getters` registry](/api/top-level#registry). For instance, the following +custom function returns [`Span`](/api/span) objects following sentence +boundaries, unless a sentence succeeds a certain amount of tokens, in which case +subsentences of at most `max_length` tokens are returned. > #### config.cfg > > ```ini > [components.transformer.model.get_spans] > @span_getters = "custom_sent_spans" +> max_length = 25 > ``` ```python @@ -325,18 +388,29 @@ process sentences. You can also register your own functions using the import spacy_transformers @spacy_transformers.registry.span_getters("custom_sent_spans") -def configure_custom_sent_spans(): - # TODO: write custom example - def get_sent_spans(docs): - return [list(doc.sents) for doc in docs] +def configure_custom_sent_spans(max_length: int): + def get_custom_sent_spans(docs): + spans = [] + for doc in docs: + spans.append([]) + for sent in doc.sents: + start = 0 + end = max_length + while end <= len(sent): + spans[-1].append(sent[start:end]) + start += max_length + end += max_length + if start < len(sent): + spans[-1].append(sent[start:len(sent)]) + return spans - return get_sent_spans + return get_custom_sent_spans ``` To resolve the config during training, spaCy needs to know about your custom function. You can make it available via the `--code` argument that can point to a Python file. For more details on training with custom code, see the -[training documentation](/usage/training#custom-code). +[training documentation](/usage/training#custom-functions). ```cli python -m spacy train ./config.cfg --code ./code.py @@ -357,8 +431,8 @@ The same idea applies to task models that power the **downstream components**. Most of spaCy's built-in model creation functions support a `tok2vec` argument, which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This is where we'll plug in our transformer model, using the -[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily -delegates to the `Transformer` pipeline component. +[TransformerListener](/api/architectures#TransformerListener) layer, which +sneakily delegates to the `Transformer` pipeline component. ```ini ### config.cfg (excerpt) {highlight="12"} @@ -373,18 +447,18 @@ maxout_pieces = 3 use_upper = false [nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy-transformers.Tok2VecListener.v1" +@architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 [nlp.pipeline.ner.model.tok2vec.pooling] @layers = "reduce_mean.v1" ``` -The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a -[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument -`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. 
This layer -determines how the vector for each spaCy token will be computed from the zero or -more source rows the token is aligned against. Here we use the +The [TransformerListener](/api/architectures#TransformerListener) layer expects +a [pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the +argument `pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. This +layer determines how the vector for each spaCy token will be computed from the +zero or more source rows the token is aligned against. Here we use the [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which averages the wordpiece rows. We could instead use [`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom @@ -480,8 +554,9 @@ vectors, but combines them via summation with a smaller table of learned embeddings. ```python -from thinc.api import add, chain, remap_ids, Embed +from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor from spacy.ml.staticvectors import StaticVectors +from spacy.util import registry @registry.architectures("my_example.MyEmbedding.v1") def MyCustomVectors( @@ -503,3 +578,22 @@ def MyCustomVectors( ## Pretraining {#pretraining} + +> #### Raw text format +> +> The raw text can be provided as JSONL (newline-delimited JSON) with a key +> `"text"` per entry. This allows the data to be read in line by line, while +> also allowing you to include newlines in the texts. +> +> ```json +> {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} +> {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} +> ``` + +```cli +$ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining +``` + +```cli +$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg +``` diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index ede4ab6f9..76858213c 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -52,9 +52,9 @@ $ pip install -U spacy To install additional data tables for lemmatization you can run `pip install spacy[lookups]` or install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) -separately. The lookups package is needed to create blank models with -lemmatization data, and to lemmatize in languages that don't yet come with -pretrained models and aren't powered by third-party libraries. +separately. The lookups package is needed to provide normalization and +lemmatization data for new models and to lemmatize in languages that don't yet +come with pretrained models and aren't powered by third-party libraries. 
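As a minimal sketch of what the lookups package enables (the same pattern is described in more detail in the lemmatization usage docs; the language code is only an example):

```python
# pip install spacy[lookups]
import spacy

nlp = spacy.blank("en")
# The lookup tables from spacy-lookups-data back this lookup-based lemmatizer
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
```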
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f2ec48d63..726cf0521 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -3,6 +3,8 @@ title: Linguistic Features next: /usage/rule-based-matching menu: - ['POS Tagging', 'pos-tagging'] + - ['Morphology', 'morphology'] + - ['Lemmatization', 'lemmatization'] - ['Dependency Parse', 'dependency-parse'] - ['Named Entities', 'named-entities'] - ['Entity Linking', 'entity-linking'] @@ -10,7 +12,8 @@ menu: - ['Merging & Splitting', 'retokenization'] - ['Sentence Segmentation', 'sbd'] - ['Vectors & Similarity', 'vectors-similarity'] - - ['Language data', 'language-data'] + - ['Mappings & Exceptions', 'mappings-exceptions'] + - ['Language Data', 'language-data'] --- Processing raw text intelligently is difficult: most words are rare, and it's @@ -37,7 +40,7 @@ in the [models directory](/models). -### Rule-based morphology {#rule-based-morphology} +## Morphology {#morphology} Inflectional morphology is the process by which a root form of a word is modified by adding prefixes or suffixes that specify its grammatical function @@ -45,33 +48,157 @@ but do not changes its part-of-speech. We say that a **lemma** (root form) is **inflected** (modified/combined) with one or more **morphological features** to create a surface form. Here are some examples: -| Context | Surface | Lemma | POS |  Morphological Features | -| ---------------------------------------- | ------- | ----- | ---- | ---------------------------------------- | -| I was reading the paper | reading | read | verb | `VerbForm=Ger` | -| I don't watch the news, I read the paper | read | read | verb | `VerbForm=Fin`, `Mood=Ind`, `Tense=Pres` | -| I read the paper yesterday | read | read | verb | `VerbForm=Fin`, `Mood=Ind`, `Tense=Past` | +| Context | Surface | Lemma | POS |  Morphological Features | +| ---------------------------------------- | ------- | ----- | ------ | ---------------------------------------- | +| I was reading the paper | reading | read | `VERB` | `VerbForm=Ger` | +| I don't watch the news, I read the paper | read | read | `VERB` | `VerbForm=Fin`, `Mood=Ind`, `Tense=Pres` | +| I read the paper yesterday | read | read | `VERB` | `VerbForm=Fin`, `Mood=Ind`, `Tense=Past` | -English has a relatively simple morphological system, which spaCy handles using -rules that can be keyed by the token, the part-of-speech tag, or the combination -of the two. The system works as follows: +Morphological features are stored in the [`MorphAnalysis`](/api/morphanalysis) +under `Token.morph`, which allows you to access individual morphological +features. The attribute `Token.morph_` provides the morphological analysis in +the Universal Dependencies +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +format. -1. The tokenizer consults a - [mapping table](/usage/adding-languages#tokenizer-exceptions) - `TOKENIZER_EXCEPTIONS`, which allows sequences of characters to be mapped to - multiple tokens. Each token may be assigned a part of speech and one or more - morphological features. -2. The part-of-speech tagger then assigns each token an **extended POS tag**. In - the API, these tags are known as `Token.tag`. They express the part-of-speech - (e.g. `VERB`) and some amount of morphological information, e.g. that the - verb is past tense. -3. 
For words whose POS is not set by a prior process, a - [mapping table](/usage/adding-languages#tag-map) `TAG_MAP` maps the tags to a - part-of-speech and a set of morphological features. -4. Finally, a **rule-based deterministic lemmatizer** maps the surface form, to - a lemma in light of the previously assigned extended part-of-speech and - morphological information, without consulting the context of the token. The - lemmatizer also accepts list-based exception files, acquired from - [WordNet](https://wordnet.princeton.edu/). +> #### 📝 Things to try +> +> 1. Change "I" to "She". You should see that the morphological features change +> and express that it's a pronoun in the third person. +> 2. Inspect `token.morph_` for the other tokens. + +```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm") +print("Pipeline:", nlp.pipe_names) +doc = nlp("I was reading the paper.") +token = doc[0] # 'I' +print(token.morph_) # 'Case=Nom|Number=Sing|Person=1|PronType=Prs' +print(token.morph.get("PronType")) # ['Prs'] +``` + +### Statistical morphology {#morphologizer new="3" model="morphologizer"} + +spaCy's statistical [`Morphologizer`](/api/morphologizer) component assigns the +morphological features and coarse-grained part-of-speech tags as `Token.morph` +and `Token.pos`. + +```python +### {executable="true"} +import spacy + +nlp = spacy.load("de_core_news_sm") +doc = nlp("Wo bist du?") # English: 'Where are you?' +print(doc[2].morph_) # 'Case=Nom|Number=Sing|Person=2|PronType=Prs' +print(doc[2].pos_) # 'PRON' +``` + +### Rule-based morphology {#rule-based-morphology} + +For languages with relatively simple morphological systems like English, spaCy +can assign morphological features through a rule-based approach, which uses the +**token text** and **fine-grained part-of-speech tags** to produce +coarse-grained part-of-speech tags and morphological features. + +1. The part-of-speech tagger assigns each token a **fine-grained part-of-speech + tag**. In the API, these tags are known as `Token.tag`. They express the + part-of-speech (e.g. verb) and some amount of morphological information, e.g. + that the verb is past tense (e.g. `VBD` for a past tense verb in the Penn + Treebank) . +2. For words whose coarse-grained POS is not set by a prior process, a + [mapping table](#mapping-exceptions) maps the fine-grained tags to a + coarse-grained POS tags and morphological features. + +```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm") +doc = nlp("Where are you?") +print(doc[2].morph_) # 'Case=Nom|Person=2|PronType=Prs' +print(doc[2].pos_) # 'PRON' +``` + +## Lemmatization {#lemmatization model="lemmatizer" new="3"} + +The [`Lemmatizer`](/api/lemmatizer) is a pipeline component that provides lookup +and rule-based lemmatization methods in a configurable component. An individual +language can extend the `Lemmatizer` as part of its +[language data](#language-data). + +```python +### {executable="true"} +import spacy + +# English models include a rule-based lemmatizer +nlp = spacy.load("en_core_web_sm") +lemmatizer = nlp.get_pipe("lemmatizer") +print(lemmatizer.mode) # 'rule' + +doc = nlp("I was reading the paper.") +print([token.lemma_ for token in doc]) +# ['I', 'be', 'read', 'the', 'paper', '.'] +``` + + + +Unlike spaCy v2, spaCy v3 models do _not_ provide lemmas by default or switch +automatically between lookup and rule-based lemmas depending on whether a tagger +is in the pipeline. 
To have lemmas in a `Doc`, the pipeline needs to include a +[`Lemmatizer`](/api/lemmatizer) component. The lemmatizer component is +configured to use a single mode such as `"lookup"` or `"rule"` on +initialization. The `"rule"` mode requires `Token.pos` to be set by a previous +component. + + + +The data for spaCy's lemmatizers is distributed in the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The +provided models already include all the required tables, but if you are creating +new models, you'll probably want to install `spacy-lookups-data` to provide the +data when the lemmatizer is initialized. + +### Lookup lemmatizer {#lemmatizer-lookup} + +For models without a tagger or morphologizer, a lookup lemmatizer can be added +to the pipeline as long as a lookup table is provided, typically through +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The +lookup lemmatizer looks up the token surface form in the lookup table without +reference to the token's part-of-speech or context. + +```python +# pip install spacy-lookups-data +import spacy + +nlp = spacy.blank("sv") +nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) +``` + +### Rule-based lemmatizer {#lemmatizer-rule} + +When training models that include a component that assigns POS (a morphologizer +or a tagger with a [POS mapping](#mappings-exceptions)), a rule-based lemmatizer +can be added using rule tables from +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data): + +```python +# pip install spacy-lookups-data +import spacy + +nlp = spacy.blank("de") +# Morphologizer (note: model is not yet trained!) +nlp.add_pipe("morphologizer") +# Rule-based lemmatizer +nlp.add_pipe("lemmatizer", config={"mode": "rule"}) +``` + +The rule-based deterministic lemmatizer maps the surface form to a lemma in +light of the previously assigned coarse-grained part-of-speech and morphological +information, without consulting the context of the token. The rule-based +lemmatizer also accepts list-based exception files. For English, these are +acquired from [WordNet](https://wordnet.princeton.edu/). ## Dependency Parsing {#dependency-parse model="parser"} @@ -82,6 +209,14 @@ check whether a [`Doc`](/api/doc) object has been parsed with the `doc.is_parsed` attribute, which returns a boolean value. If this attribute is `False`, the default sentence iterator will raise an exception. + + +For a list of the syntactic dependency labels assigned by spaCy's models across +different languages, see the label schemes documented in the +[models directory](/models). + + + ### Noun chunks {#noun-chunks} Noun chunks are "base noun phrases" – flat phrases that have a noun as their @@ -288,11 +423,45 @@ for token in doc: | their | `ADJ` | `poss` | requests | | requests | `NOUN` | `dobj` | submit | - +The dependency parse can be a useful tool for **information extraction**, +especially when combined with other predictions like +[named entities](#named-entities). The following example extracts money and +currency values, i.e. entities labeled as `MONEY`, and then uses the dependency +parse to find the noun phrase they are referring to – for example `"Net income"` +→ `"$9.4 million"`. -For a list of the syntactic dependency labels assigned by spaCy's models across -different languages, see the label schemes documented in the -[models directory](/models). 
+```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm") +# Merge noun phrases and entities for easier analysis +nlp.add_pipe("merge_entities") +nlp.add_pipe("merge_noun_chunks") + +TEXTS = [ + "Net income was $9.4 million compared to the prior year of $2.7 million.", + "Revenue exceeded twelve billion dollars, with a loss of $1b.", +] +for doc in nlp.pipe(TEXTS): + for token in doc: + if token.ent_type_ == "MONEY": + # We have an attribute and direct object, so check for subject + if token.dep_ in ("attr", "dobj"): + subj = [w for w in token.head.lefts if w.dep_ == "nsubj"] + if subj: + print(subj[0], "-->", token) + # We have a prepositional object with a preposition + elif token.dep_ == "pobj" and token.head.dep_ == "prep": + print(token.head.head, "-->", token) +``` + + + +For more examples of how to write rule-based information extraction logic that +takes advantage of the model's predictions produced by the different components, +see the usage guide on +[combining models and rules](/usage/rule-based-matching#models-rules). @@ -378,7 +547,7 @@ on a token, it will return an empty string. > > #### BILUO Scheme > -> - `B` – Token is the **beginning** of an entity. +> - `B` – Token is the **beginning** of a multi-token entity. > - `I` – Token is **inside** a multi-token entity. > - `L` – Token is the **last** token of a multi-token entity. > - `U` – Token is a single-token **unit** entity. @@ -545,7 +714,7 @@ identifier from a knowledge base (KB). You can create your own [train a new Entity Linking model](/usage/training#entity-linker) using that custom-made KB. -### Accessing entity identifiers {#entity-linking-accessing} +### Accessing entity identifiers {#entity-linking-accessing model="entity linking"} The annotated KB identifier is accessible as either a hash value or as a string, using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) @@ -571,15 +740,6 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259'] print(ent_london_5) # ['London', 'GPE', 'Q84'] ``` -| Text | ent_type\_ | ent_kb_id\_ | -| -------- | ---------- | ----------- | -| Ada | `"PERSON"` | `"Q7259"` | -| Lovelace | `"PERSON"` | `"Q7259"` | -| was | - | - | -| born | - | - | -| in | - | - | -| London | `"GPE"` | `"Q84"` | - ## Tokenization {#tokenization} Tokenization is the task of splitting a text into meaningful segments, called @@ -717,14 +877,6 @@ subclass. --- - - ### Adding special case tokenization rules {#special-cases} Most domains have at least some idiosyncrasies that require custom tokenization @@ -1439,28 +1591,46 @@ print("After:", [(token.text, token._.is_musician) for token in doc]) ## Sentence Segmentation {#sbd} - - A [`Doc`](/api/doc) object's sentences are available via the `Doc.sents` -property. Unlike other libraries, spaCy uses the dependency parse to determine -sentence boundaries. This is usually more accurate than a rule-based approach, -but it also means you'll need a **statistical model** and accurate predictions. -If your texts are closer to general-purpose news or web text, this should work -well out-of-the-box. For social media or conversational text that doesn't follow -the same rules, your application may benefit from a custom rule-based -implementation. You can either use the built-in -[`Sentencizer`](/api/sentencizer) or plug an entirely custom rule-based function -into your [processing pipeline](/usage/processing-pipelines). +property. 
To view a `Doc`'s sentences, you can iterate over the `Doc.sents`, a +generator that yields [`Span`](/api/span) objects. You can check whether a `Doc` +has sentence boundaries with the `doc.is_sentenced` attribute. -spaCy's dependency parser respects already set boundaries, so you can preprocess -your `Doc` using custom rules _before_ it's parsed. Depending on your text, this -may also improve accuracy, since the parser is constrained to predict parses -consistent with the sentence boundaries. +```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm") +doc = nlp("This is a sentence. This is another sentence.") +assert doc.is_sentenced +for sent in doc.sents: + print(sent.text) +``` + +spaCy provides four alternatives for sentence segmentation: + +1. [Dependency parser](#sbd-parser): the statistical + [`DependencyParser`](/api/dependencyparser) provides the most accurate + sentence boundaries based on full dependency parses. +2. [Statistical sentence segmenter](#sbd-senter): the statistical + [`SentenceRecognizer`](/api/sentencerecognizer) is a simpler and faster + alternative to the parser that only sets sentence boundaries. +3. [Rule-based pipeline component](#sbd-component): the rule-based + [`Sentencizer`](/api/sentencizer) sets sentence boundaries using a + customizable list of sentence-final punctuation. +4. [Custom function](#sbd-custom): your own custom function added to the + processing pipeline can set sentence boundaries by writing to + `Token.is_sent_start`. ### Default: Using the dependency parse {#sbd-parser model="parser"} -To view a `Doc`'s sentences, you can iterate over the `Doc.sents`, a generator -that yields [`Span`](/api/span) objects. +Unlike other libraries, spaCy uses the dependency parse to determine sentence +boundaries. This is usually the most accurate approach, but it requires a +**statistical model** that provides accurate predictions. If your texts are +closer to general-purpose news or web text, this should work well out-of-the-box +with spaCy's provided models. For social media or conversational text that +doesn't follow the same rules, your application may benefit from a custom model +or rule-based component. ```python ### {executable="true"} @@ -1472,12 +1642,43 @@ for sent in doc.sents: print(sent.text) ``` +spaCy's dependency parser respects already set boundaries, so you can preprocess +your `Doc` using custom components _before_ it's parsed. Depending on your text, +this may also improve parse accuracy, since the parser is constrained to predict +parses consistent with the sentence boundaries. + +### Statistical sentence segmenter {#sbd-senter model="senter" new="3"} + +The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical +component that only provides sentence boundaries. Along with being faster and +smaller than the parser, its primary advantage is that it's easier to train +custom models because it only requires annotated sentence boundaries rather than +full dependency parses. + + + +> #### senter vs. parser +> +> The recall for the `senter` is typically slightly lower than for the parser, +> which is better at predicting sentence boundaries when punctuation is not +> present. + +```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm", enable=["senter"], disable=["parser"]) +doc = nlp("This is a sentence. 
This is another sentence.") +for sent in doc.sents: + print(sent.text) +``` + ### Rule-based pipeline component {#sbd-component} The [`Sentencizer`](/api/sentencizer) component is a [pipeline component](/usage/processing-pipelines) that splits sentences on punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only -need sentence boundaries without the dependency parse. +need sentence boundaries without dependency parses. ```python ### {executable="true"} @@ -1504,7 +1705,7 @@ and can still be overwritten by the parser. To prevent inconsistent state, you can only set boundaries **before** a document -is parsed (and `Doc.is_parsed` is `False`). To ensure that your component is +is parsed (and `doc.is_parsed` is `False`). To ensure that your component is added in the right place, you can set `before='parser'` or `first=True` when adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). @@ -1541,6 +1742,77 @@ doc = nlp(text) print("After:", [sent.text for sent in doc.sents]) ``` +## Mappings & Exceptions {#mappings-exceptions new="3"} + +The [`AttributeRuler`](/api/attributeruler) manages **rule-based mappings and +exceptions** for all token-level attributes. As the number of +[pipeline components](/api/#architecture-pipeline) has grown from spaCy v2 to +v3, handling rules and exceptions in each component individually has become +impractical, so the `AttributeRuler` provides a single component with a unified +pattern format for all token attribute mappings and exceptions. + +The `AttributeRuler` uses +[`Matcher` patterns](/usage/rule-based-matching#adding-patterns) to identify +tokens and then assigns them the provided attributes. If needed, the +[`Matcher`](/api/matcher) patterns can include context around the target token. +For example, the attribute ruler can: + +- provide exceptions for any **token attributes** +- map **fine-grained tags** to **coarse-grained tags** for languages without + statistical morphologizers (replacing the v2.x `tag_map` in the + [language data](#language-data)) +- map token **surface form + fine-grained tags** to **morphological features** + (replacing the v2.x `morph_rules` in the [language data](#language-data)) +- specify the **tags for space tokens** (replacing hard-coded behavior in the + tagger) + +The following example shows how the tag and POS `NNP`/`PROPN` can be specified +for the phrase `"The Who"`, overriding the tags provided by the statistical +tagger and the POS tag map. + +```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm") +text = "I saw The Who perform. Who did you see?" 
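+# Check the tags assigned by the statistical tagger before adding any rules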
+doc1 = nlp(text) +print(doc1[2].tag_, doc1[2].pos_) # DT DET +print(doc1[3].tag_, doc1[3].pos_) # WP PRON + +# Add attribute ruler with exception for "The Who" as NNP/PROPN NNP/PROPN +ruler = nlp.get_pipe("attribute_ruler") +# Pattern to match "The Who" +patterns = [[{"LOWER": "the"}, {"TEXT": "Who"}]] +# The attributes to assign to the matched token +attrs = {"TAG": "NNP", "POS": "PROPN"} +# Add rules to the attribute ruler +ruler.add(patterns=patterns, attrs=attrs, index=0) # "The" in "The Who" +ruler.add(patterns=patterns, attrs=attrs, index=1) # "Who" in "The Who" + +doc2 = nlp(text) +print(doc2[2].tag_, doc2[2].pos_) # NNP PROPN +print(doc2[3].tag_, doc2[3].pos_) # NNP PROPN +# The second "Who" remains unmodified +print(doc2[5].tag_, doc2[5].pos_) # WP PRON +``` + + + +For easy migration from from spaCy v2 to v3, the +[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** +in the v2 format with the methods +[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and +[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules). + +```diff +nlp = spacy.blank("en") ++ ruler = nlp.add_pipe("attribute_ruler") ++ ruler.load_from_tag_map(YOUR_TAG_MAP) +``` + + + ## Word vectors and semantic similarity {#vectors-similarity} import Vectors101 from 'usage/101/\_vectors-similarity.md' @@ -1670,7 +1942,7 @@ for word, vector in vector_data.items(): vocab.set_vector(word, vector) ``` -## Language data {#language-data} +## Language Data {#language-data} import LanguageData101 from 'usage/101/\_language-data.md' diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 614f113b3..3636aa3c2 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -220,53 +220,70 @@ available pipeline components and component functions. > ruler = nlp.add_pipe("entity_ruler") > ``` -| String name | Component | Description | -| --------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | -| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | -| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | -| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | -| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | -| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | -| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words. | -| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | -| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | -| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | -| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | -| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. 
| +| String name | Component | Description | +| ----------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | +| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | +| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | +| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | +| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | +| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words. | +| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | +| `attribute_ruler` | [`AttributeRuler`](/api/attributeruler) | Assign token attribute mappings and rule-based exceptions. | +| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | +| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | -### Disabling and modifying pipeline components {#disabling} +### Disabling, excluding and modifying components {#disabling} If you don't need a particular component of the pipeline – for example, the -tagger or the parser, you can **disable loading** it. This can sometimes make a -big difference and improve loading speed. Disabled component names can be -provided to [`spacy.load`](/api/top-level#spacy.load), -[`Language.from_disk`](/api/language#from_disk) or the `nlp` object itself as a -list: +tagger or the parser, you can **disable or exclude** it. This can sometimes make +a big difference and improve loading and inference speed. There are two +different mechanisms you can use: + +1. **Disable:** The component and its data will be loaded with the model, but it + will be disabled by default and not run as part of the processing pipeline. + To run it, you can explicitly enable it by calling + [`nlp.enable_pipe`](/api/language#enable_pipe). When you save out the `nlp` + object, the disabled component will be included but disabled by default. +2. **Exclude:** Don't load the component and its data with the model. Once the + model is loaded, there will be no reference to the excluded component. + +Disabled and excluded component names can be provided to +[`spacy.load`](/api/top-level#spacy.load) as a list. + + + +> #### 💡 Models with optional components +> +> The `disable` mechanism makes it easy to distribute models with optional +> components that you can enable or disable at runtime. For instance, your model +> may include a statistical _and_ a rule-based component for sentence +> segmentation, and you can choose which one to run depending on your use case. 
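+
+As a sketch of that scenario, assuming a hypothetical model package
+`my_custom_model` that ships with both a statistical `senter` and a rule-based
+`sentencizer`, you could toggle between the two components at runtime:
+
+```python
+### Toggling optional components (sketch)
+import spacy
+
+# Load the hypothetical model with the statistical component disabled
+nlp = spacy.load("my_custom_model", disable=["senter"])
+# The rule-based sentencizer handles sentence segmentation
+doc = nlp("This is a sentence. This is another sentence.")
+
+# Switch: enable the statistical component and disable the rule-based one
+nlp.enable_pipe("senter")
+nlp.disable_pipe("sentencizer")
+```
+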
```python -### Disable loading +# Load the model without the entity recognizer +nlp = spacy.load("en_core_web_sm", exclude=["ner"]) + +# Load the tagger and parser but don't enable them nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"]) +# Explicitly enable the tagger later on +nlp.enable_pipe("tagger") ``` -In some cases, you do want to load all pipeline components and their weights, -because you need them at different points in your application. However, if you -only need a `Doc` object with named entities, there's no need to run all -pipeline components on it – that can potentially make processing much slower. -Instead, you can use the `disable` keyword argument on -[`nlp.pipe`](/api/language#pipe) to temporarily disable the components **during -processing**: + -```python -### Disable for processing -for doc in nlp.pipe(texts, disable=["tagger", "parser"]): - # Do something with the doc here -``` +As of v3.0, the `disable` keyword argument specifies components to load but +disable, instead of components to not load at all. Those components can now be +specified separately using the new `exclude` keyword argument. -If you need to **execute more code** with components disabled – e.g. to reset -the weights or update only some components during training – you can use the -[`nlp.select_pipes`](/api/language#select_pipes) context manager. At the end of -the `with` block, the disabled pipeline components will be restored + + +As a shortcut, you can use the [`nlp.select_pipes`](/api/language#select_pipes) +context manager to temporarily disable certain components for a given block. At +the end of the `with` block, the disabled pipeline components will be restored automatically. Alternatively, `select_pipes` returns an object that lets you call its `restore()` method to restore the disabled components when needed. This can be useful if you want to prevent unnecessary code indentation of large @@ -295,6 +312,14 @@ with nlp.select_pipes(enable="parser"): doc = nlp("I will only be parsed") ``` +The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword +argument if you only want to disable components during processing: + +```python +for doc in nlp.pipe(texts, disable=["tagger", "parser"]): + # Do something with the doc here +``` + Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method to remove pipeline components from an existing pipeline, the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the @@ -308,6 +333,31 @@ nlp.rename_pipe("ner", "entityrecognizer") nlp.replace_pipe("tagger", my_custom_tagger) ``` +The `Language` object exposes different [attributes](/api/language#attributes) +that let you inspect all available components and the components that currently +run as part of the pipeline. + +> #### Example +> +> ```python +> nlp = spacy.blank("en") +> nlp.add_pipe("ner") +> nlp.add_pipe("textcat") +> assert nlp.pipe_names == ["ner", "textcat"] +> nlp.disable_pipe("ner") +> assert nlp.pipe_names == ["textcat"] +> assert nlp.component_names == ["ner", "textcat"] +> assert nlp.disabled == ["ner"] +> ``` + +| Name | Description | +| --------------------- | ---------------------------------------------------------------- | +| `nlp.pipeline` | `(name, component)` tuples of the processing pipeline, in order. | +| `nlp.pipe_names` | Pipeline component names, in order. | +| `nlp.components` | All `(name, component)` tuples, including disabled components. 
| +| `nlp.component_names` | All component names, including disabled components. | +| `nlp.disabled` | Names of components that are currently disabled. | + ### Sourcing pipeline components from existing models {#sourced-components new="3"} Pipeline components that are independent can also be reused across models. diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 1aaaeb3af..620526280 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -88,6 +88,12 @@ can also use any private repo you have access to with Git. > - dest: 'assets/training.spacy' > url: 'https://example.com/data.spacy' > checksum: '63373dd656daa1fd3043ce166a59474c' +> - dest: 'assets/development.spacy' +> git: +> repo: 'https://github.com/example/repo' +> branch: 'master' +> path: 'path/developments.spacy' +> checksum: '5113dc04e03f079525edd8df3f4f39e3' > ``` Assets are data files your project needs – for example, the training and @@ -102,6 +108,11 @@ $ cd some_example_project $ python -m spacy project assets ``` +Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and +even cloud storage such as GCS and S3. You can also fetch assets using git, by +replacing the `url` string with a `git` block. spaCy will use Git's "sparse +checkout" feature, to avoid download the whole repository. + ### 3. Run a command {#run} > #### project.yml @@ -215,12 +226,100 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project. | Section | Description | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | +| `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | -| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. | +| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. 
Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | | `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | +### Data assets {#data-assets} + +Assets are any files that your project might need, like training and development +corpora or pretrained weights for initializing your model. Assets are defined in +the `assets` block of your `project.yml` and can be downloaded using the +[`project assets`](/api/cli#project-assets) command. Defining checksums lets you +verify that someone else running your project will use the same files you used. +Asset URLs can be a number of different **protocols**: HTTP, HTTPS, FTP, SSH, +and even **cloud storage** such as GCS and S3. You can also download assets from +a **Git repo** instead. + +#### Downloading from a URL or cloud storage {#data-assets-url} + +Under the hood, spaCy uses the +[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library so you +can use any protocol it supports. Note that you may need to install extra +dependencies to use certain protocols. + +> #### project.yml +> +> ```yaml +> assets: +> # Download from public HTTPS URL +> - dest: 'assets/training.spacy' +> url: 'https://example.com/data.spacy' +> checksum: '63373dd656daa1fd3043ce166a59474c' +> # Download from Google Cloud Storage bucket +> - dest: 'assets/development.spacy' +> url: 'gs://your-bucket/corpora' +> checksum: '5113dc04e03f079525edd8df3f4f39e3' +> ``` + +| Name | Description | +| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `url` | The URL to download from, using the respective protocol. | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | + +#### Downloading from a Git repo {#data-assets-git} + +If a `git` block is provided, the asset is downloaded from the given Git +repository. You can download from any repo that you have access to. Under the +hood, this uses Git's "sparse checkout" feature, so you're only downloading the +files you need and not the whole repo. 
+ +> #### project.yml +> +> ```yaml +> assets: +> - dest: 'assets/training.spacy' +> git: +> repo: 'https://github.com/example/repo' +> branch: 'master' +> path: 'path/training.spacy' +> checksum: '63373dd656daa1fd3043ce166a59474c' +> description: 'The training data (5000 examples)' +> ``` + +| Name | Description | +| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | + +#### Working with private assets {#data-asets-private} + +> #### project.yml +> +> ```yaml +> assets: +> - dest: 'assets/private_training_data.json' +> checksum: '63373dd656daa1fd3043ce166a59474c' +> - dest: 'assets/private_vectors.bin' +> checksum: '5113dc04e03f079525edd8df3f4f39e3' +> ``` + +For many projects, the datasets and weights you're working with might be +company-internal and not available over the internet. In that case, you can +specify the destination paths and a checksum, and leave out the URL. When your +teammates clone and run your project, they can place the files in the respective +directory themselves. The [`project assets`](/api/cli#project-assets) command +will alert about missing files and mismatched checksums, so you can ensure that +others are running your project with the same data. + ### Dependencies and outputs {#deps-outputs} Each command defined in the `project.yml` can optionally define a list of @@ -394,12 +493,39 @@ vars: commands: - name: evaluate script: - - 'python scripts/custom_evaluation.py ${batch_size} ./training/model-best ./corpus/eval.json' + - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json' deps: - 'training/model-best' - 'corpus/eval.json' ``` +### Documenting your project {#custom-docs} + +> #### Readme Example +> +> For more examples, see the [`projects`](https://github.com/explosion/projects) +> repo. +> +> ![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg) + +When your custom project is ready and you want to share it with others, you can +use the [`spacy project document`](/api/cli#project-document) command to +**auto-generate** a pretty, Markdown-formatted `README` file based on your +project's `project.yml`. It will list all commands, workflows and assets defined +in the project and include details on how to run the project, as well as links +to the relevant spaCy documentation to make it easy for others to get started +using your project. + +```cli +$ python -m spacy project document --output README.md +``` + +Under the hood, hidden markers are added to identify where the auto-generated +content starts and ends. This means that you can add your own custom content +before or after it and re-running the `project document` command will **only +update the auto-generated part**. This makes it easy to keep your documentation +up to date. + ### Cloning from your own repo {#custom-repo} The [`spacy project clone`](/api/cli#project-clone) command lets you customize @@ -427,25 +553,6 @@ projects.
-### Working with private assets {#private-assets} - -For many projects, the datasets and weights you're working with might be -company-internal and not available via a public URL. In that case, you can -specify the destination paths and a checksum, and leave out the URL. When your -teammates clone and run your project, they can place the files in the respective -directory themselves. The [`spacy project assets`](/api/cli#project-assets) -command will alert about missing files and mismatched checksums, so you can -ensure that others are running your project with the same data. - -```yaml -### project.yml -assets: - - dest: 'assets/private_training_data.json' - checksum: '63373dd656daa1fd3043ce166a59474c' - - dest: 'assets/private_vectors.bin' - checksum: '5113dc04e03f079525edd8df3f4f39e3' -``` - ## Remote Storage {#remote} You can persist your project outputs to a remote storage using the diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 2a47fd264..d5fea9fee 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -142,6 +142,7 @@ add to your pipeline and customize for your use case: > #### Example > > ```python +> # pip install spacy-lookups-data > nlp = spacy.blank("en") > nlp.add_pipe("lemmatizer") > ``` @@ -249,23 +250,26 @@ in your config and see validation errors if the argument values don't match. The following methods, attributes and commands are new in spaCy v3.0. -| Name | Description | -| ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | -| [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | -| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | -| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | -| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. | -| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. | -| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class.s | -| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. | -| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | -| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. 
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
-| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
-| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all models installed in the environment. |
-| [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
-| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
+| Name | Description |
+| --------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
+| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
+| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
+| [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
+| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
+| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
+| [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
+| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
+| [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
+| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
+| [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
+| [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. |
+| [`Pipe.score`](/api/pipe#score) | Method on pipeline components that returns a dictionary of evaluation scores. 
| +| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | +| [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | +| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all models installed in the environment. | +| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | +| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | ### New and updated documentation {#new-docs} @@ -300,7 +304,10 @@ format for documenting argument and return types. [Layers & Architectures](/usage/layers-architectures), [Projects](/usage/projects), [Custom pipeline components](/usage/processing-pipelines#custom-components), - [Custom tokenizers](/usage/linguistic-features#custom-tokenizer) + [Custom tokenizers](/usage/linguistic-features#custom-tokenizer), + [Morphology](/usage/linguistic-features#morphology), + [Lemmatization](/usage/linguistic-features#lemmatization), + [Mapping & Exceptions](/usage/linguistic-features#mappings-exceptions) - **API Reference: ** [Library architecture](/api), [Model architectures](/api/architectures), [Data formats](/api/data-formats) - **New Classes: ** [`Example`](/api/example), [`Tok2Vec`](/api/tok2vec), @@ -365,19 +372,27 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list of patterns as the second argument (instead of a variable number of arguments). The `on_match` callback becomes an optional keyword argument. +- The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has + been removed. +- The `TAG_MAP` and `MORPH_RULES` in the language data have been replaced by the + more flexible [`AttributeRuler`](/api/attributeruler). +- The [`Lemmatizer`](/api/lemmatizer) is now a standalone pipeline component and + doesn't provide lemmas by default or switch automatically between lookup and + rule-based lemmas. You can now add it to your pipeline explicitly and set its + mode on initialization. 
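+
+For example, a pipeline that previously relied on automatic lookup lemmas can
+now add the component explicitly and pick a mode. A minimal sketch, assuming
+the lookup tables from `spacy-lookups-data` are installed:
+
+```python
+# pip install spacy-lookups-data
+import spacy
+
+nlp = spacy.blank("en")
+# Add the lemmatizer explicitly and set its mode on initialization
+nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+```
+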
### Removed or renamed API {#incompat-removed} -| Removed | Replacement | -| -------------------------------------------------------- | ------------------------------------------------------------------------------------------ | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) | -| `GoldParse` | [`Example`](/api/example) | -| `GoldCorpus` | [`Corpus`](/api/corpus) | -| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | -| `spacy init-model` | [`spacy init model`](/api/cli#init-model) | -| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | -| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | -| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | +| Removed | Replacement | +| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `spacy init-model` | [`spacy init model`](/api/cli#init-model) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -394,7 +409,7 @@ on them. | keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | | `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | | `verbose` argument on [`Language.evaluate`](/api/language#evaluate) | logging (`DEBUG`) | -| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer) | +| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) | ## Migrating from v2.x {#migrating} @@ -551,6 +566,24 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")] + matcher.add("HEALTH", patterns, on_match=on_match) ``` +### Migrating tag maps and morph rules {#migrating-training-mappings-exceptions} + +Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy +v3.0 now manages mappings and exceptions with a separate and more flexible +pipeline component, the [`AttributeRuler`](/api/attributeruler). See the +[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. 
The +`AttributeRuler` provides two handy helper methods +[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and +[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let +you load in your existing tag map or morph rules: + +```diff +nlp = spacy.blank("en") +- nlp.vocab.morphology.load_tag_map(YOUR_TAG_MAP) ++ ruler = nlp.add_pipe("attribute_ruler") ++ ruler.load_from_tag_map(YOUR_TAG_MAP) +``` + ### Training models {#migrating-training} To train your models, you should now pretty much always use the @@ -596,8 +629,8 @@ If you've exported a starter config from our values. You can then use the auto-generated `config.cfg` for training: ```diff -### {wrap="true"} -- python -m spacy train en ./output ./train.json ./dev.json --pipeline tagger,parser --cnn-window 1 --bilstm-depth 0 +- python -m spacy train en ./output ./train.json ./dev.json +--pipeline tagger,parser --cnn-window 1 --bilstm-depth 0 + python -m spacy train ./config.cfg --output ./output ``` diff --git a/website/src/components/code.js b/website/src/components/code.js index 740544f43..f075539ea 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -169,7 +169,13 @@ function formatCode(html, lang, prompt) { } const result = html .split('\n') - .map((line, i) => (prompt ? replacePrompt(line, prompt, i === 0) : line)) + .map((line, i) => { + let newLine = prompt ? replacePrompt(line, prompt, i === 0) : line + if (lang === 'diff' && !line.startsWith('<')) { + newLine = highlightCode('python', line) + } + return newLine + }) .join('\n') return htmlToReact(result) } diff --git a/website/src/components/juniper.js b/website/src/components/juniper.js index 9f298e1f0..09ab0cba6 100644 --- a/website/src/components/juniper.js +++ b/website/src/components/juniper.js @@ -28,7 +28,6 @@ export default class Juniper extends React.Component { mode: this.props.lang, theme: this.props.theme, }) - const runCode = () => this.execute(outputArea, cm.getValue()) cm.setOption('extraKeys', { 'Shift-Enter': runCode }) Widget.attach(outputArea, this.outputRef) diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 775523190..3e6a2124b 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -65,12 +65,12 @@ --color-subtle-dark: hsl(162, 5%, 60%) --color-green-medium: hsl(108, 66%, 63%) - --color-green-transparent: hsla(108, 66%, 63%, 0.11) + --color-green-transparent: hsla(108, 66%, 63%, 0.12) --color-red-light: hsl(355, 100%, 96%) --color-red-medium: hsl(346, 84%, 61%) --color-red-dark: hsl(332, 64%, 34%) --color-red-opaque: hsl(346, 96%, 89%) - --color-red-transparent: hsla(346, 84%, 61%, 0.11) + --color-red-transparent: hsla(346, 84%, 61%, 0.12) --color-yellow-light: hsl(46, 100%, 95%) --color-yellow-medium: hsl(45, 90%, 55%) --color-yellow-dark: hsl(44, 94%, 27%) @@ -79,11 +79,11 @@ // Syntax Highlighting --syntax-comment: hsl(162, 5%, 60%) --syntax-tag: hsl(266, 72%, 72%) - --syntax-number: hsl(266, 72%, 72%) + --syntax-number: var(--syntax-tag) --syntax-selector: hsl(31, 100%, 71%) - --syntax-operator: hsl(342, 100%, 59%) --syntax-function: hsl(195, 70%, 54%) - --syntax-keyword: hsl(342, 100%, 59%) + --syntax-keyword: hsl(343, 100%, 68%) + --syntax-operator: var(--syntax-keyword) --syntax-regex: hsl(45, 90%, 55%) // Other @@ -354,6 +354,7 @@ body [id]:target &.inserted, &.deleted padding: 2px 0 border-radius: 2px + opacity: 0.9 &.inserted color: var(--color-green-medium) @@ -388,7 +389,6 @@ body [id]:target .token color: var(--color-subtle) 
- .gatsby-highlight-code-line background-color: var(--color-dark-secondary) border-left: 0.35em solid var(--color-theme) @@ -409,6 +409,7 @@ body [id]:target color: var(--color-subtle) .CodeMirror-line + color: var(--syntax-comment) padding: 0 .CodeMirror-selected @@ -418,26 +419,25 @@ body [id]:target .CodeMirror-cursor border-left-color: currentColor - .cm-variable-2 - color: inherit - font-style: italic + .cm-property, .cm-variable, .cm-variable-2, .cm-meta // decorators + color: var(--color-subtle) .cm-comment color: var(--syntax-comment) - .cm-keyword + .cm-keyword, .cm-builtin color: var(--syntax-keyword) .cm-operator color: var(--syntax-operator) - .cm-string, .cm-builtin + .cm-string color: var(--syntax-selector) .cm-number color: var(--syntax-number) - .cm-def, .cm-meta + .cm-def color: var(--syntax-function) // Jupyter