import pytest
from spacy.language import Language
from spacy.training import Example
import spacy
from spacy.tokens import Doc
import numpy as np

# Mock training data for the update test.
# Fix: defined before first use instead of at the bottom of the module,
# so the file reads top-to-bottom and reordering tests cannot break it.
TRAIN_DATA = [
    ("This is positive", {"cats": {"positive": 1.0, "negative": 0.0}}),
    ("This is negative", {"cats": {"positive": 0.0, "negative": 1.0}}),
]


@pytest.fixture
def nlp():
    """Provide a fresh blank English pipeline for each test."""
    return spacy.blank("en")


@Language.component("pure_logistic_textcat")
def pure_logistic_textcat(doc):
    """Stateless stand-in categorizer: assigns fixed, uniform scores.

    Replace with the real model's logic when testing against it; the tests
    below only check the component wiring and the extension attribute.
    """
    scores = {"positive": 0.5, "negative": 0.5}
    doc._.set("textcat_scores", scores)
    return doc


# Register the custom extension attribute once, guarded so re-importing
# this module does not raise on double registration.
if not Doc.has_extension("textcat_scores"):
    Doc.set_extension("textcat_scores", default=None)


def test_pure_logistic_textcat_init(nlp):
    """The component can be added to the pipeline."""
    textcat = nlp.add_pipe("pure_logistic_textcat")
    assert textcat is not None


def test_pure_logistic_textcat_predict(nlp):
    """Running the pipeline attaches a per-label score dict to the doc."""
    nlp.add_pipe("pure_logistic_textcat")
    doc = nlp("This is a test document")

    assert doc._.textcat_scores is not None
    assert isinstance(doc._.textcat_scores, dict)
    assert "positive" in doc._.textcat_scores
    assert "negative" in doc._.textcat_scores


def test_pure_logistic_textcat_update(nlp):
    """A mocked update() returns a loss entry keyed by the component name."""

    def mock_update(examples):
        # Dummy loss value standing in for real training.
        return {"pure_logistic_textcat": 0.5}

    textcat = nlp.add_pipe("pure_logistic_textcat")
    # Function components have no update(); attach the mock for this test.
    textcat.update = mock_update

    train_examples = []
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        train_examples.append(Example.from_dict(doc, annotations))

    losses = textcat.update(train_examples)
    assert isinstance(losses, dict)
    assert "pure_logistic_textcat" in losses
@Language.factory(
    "pure_logistic_textcat",
    default_config={
        "learning_rate": 0.001,
        "max_iterations": 100,
        "batch_size": 1000,
    },
)
def make_pure_logistic_textcat(
    nlp: Language,
    name: str,
    learning_rate: float,
    max_iterations: int,
    batch_size: int,
) -> "PureLogisticTextCategorizer":
    """Factory entry point used by `nlp.add_pipe("pure_logistic_textcat")`."""
    return PureLogisticTextCategorizer(
        vocab=nlp.vocab,
        name=name,
        learning_rate=learning_rate,
        max_iterations=max_iterations,
        batch_size=batch_size,
    )


class PureLogisticTextCategorizer(TrainablePipe):
    """Binary ("positive"/"negative") text categorizer trained with a
    hand-rolled logistic regression over simple doc-level features.

    Scores are attached to `doc._.textcat_scores` by `predict` and copied
    onto `doc.cats` by `set_annotations`.
    """

    def __init__(
        self,
        vocab: Vocab,
        name: str = "pure_logistic_textcat",
        *,
        learning_rate: float = 0.001,
        max_iterations: int = 100,
        batch_size: int = 1000,
    ):
        """Initialize the text categorizer.

        learning_rate: gradient-descent step size used in `update`.
        max_iterations: passes over the batch per `update` call.
        batch_size: kept for config compatibility; not read by the trainer.
        """
        self.vocab = vocab
        self.name = name
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.batch_size = batch_size
        self.weights = None  # lazily initialised on the first update()
        self.bias = 0.0
        self._labels = set()  # labels observed during training

        # Register the custom extension attribute if it doesn't exist.
        if not Doc.has_extension("textcat_scores"):
            Doc.set_extension("textcat_scores", default=None)

    @property
    def labels(self):
        """The set of labels seen so far."""
        return self._labels

    @labels.setter
    def labels(self, value):
        """Replace the label set."""
        self._labels = value

    def predict(self, docs):
        """Score a batch of docs; writes `doc._.textcat_scores` and returns
        the docs.

        NOTE(review): TrainablePipe.predict conventionally returns raw
        scores for `set_annotations` rather than the docs; kept as-is
        because the tests rely on the extension attribute being set here.
        """
        scores = self._predict_scores(docs)
        for doc, doc_scores in zip(docs, scores):
            doc._.textcat_scores = doc_scores
        return docs

    def _predict_scores(self, docs):
        """Return one {"positive": p, "negative": 1 - p} dict per doc."""
        features = self._extract_features(docs)
        scores = []
        for doc_features in features:
            if self.weights is None:
                # Untrained model: fall back to a uniform distribution.
                doc_scores = {"positive": 0.5, "negative": 0.5}
            else:
                logits = np.dot(doc_features, self.weights) + self.bias
                prob = 1 / (1 + np.exp(-logits))
                doc_scores = {
                    "positive": float(prob),
                    "negative": float(1 - prob),
                }
            scores.append(doc_scores)
        return scores

    def set_annotations(self, docs, scores):
        """Set the predicted categories on the docs.

        Fix: each per-doc score is a dict keyed by label name, so it must be
        indexed by label; the previous integer indexing (`score[i]`) raised
        KeyError on every call. Missing labels default to 0.0.
        """
        for doc, score in zip(docs, scores):
            doc.cats = {label: score.get(label, 0.0) for label in self._labels}

    def _extract_features(self, docs) -> List[np.ndarray]:
        """Build one feature vector per doc: the doc vector concatenated
        with four scaled surface statistics."""
        features = []
        for doc in docs:
            doc_vector = doc.vector
            n_tokens = len(doc)
            n_entities = len(doc.ents)

            # Fix: guard empty docs — np.mean([]) is nan and the stopword
            # ratio divided by zero.
            if n_tokens:
                avg_token_length = np.mean([len(token.text) for token in doc])
                stopword_ratio = sum(token.is_stop for token in doc) / n_tokens
            else:
                avg_token_length = 0.0
                stopword_ratio = 0.0

            doc_features = np.concatenate([
                doc_vector,
                [n_tokens / 100, n_entities / 10,
                 avg_token_length / 10, stopword_ratio],
            ])
            features.append(doc_features)
        return features

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd=None,
        losses: Dict[str, float] = None,
    ) -> Dict[str, float]:
        """Train on a batch of examples with plain gradient descent.

        Returns `losses` with the accumulated loss under `self.name`.
        `drop` and `sgd` are accepted for API compatibility and ignored.
        """
        losses = {} if losses is None else losses

        # Fix: materialise once — `examples` may be a generator and is
        # iterated several times below; also guard the empty batch, which
        # previously raised ZeroDivisionError.
        examples = list(examples)
        if not examples:
            losses.setdefault(self.name, 0.0)
            return losses

        # Update the label set from the gold annotations.
        for example in examples:
            self._labels.update(example.reference.cats.keys())

        docs = [example.reference for example in examples]
        label_arrays = self._make_label_array(
            [example.reference.cats for example in examples]
        )
        features = self._extract_features(docs)

        if self.weights is None:
            n_features = features[0].shape[0] if features else 0
            self.weights = np.zeros((n_features, 1))

        total_loss = 0.0
        for _ in range(self.max_iterations):
            for feat, gold in zip(features, label_arrays):
                pred = 1 / (1 + np.exp(-(np.dot(feat, self.weights) + self.bias)))
                # Binary cross-entropy with an epsilon for numerical safety.
                loss = -np.mean(gold * np.log(pred + 1e-8)
                                + (1 - gold) * np.log(1 - pred + 1e-8))
                total_loss += loss

                # Gradients of the logistic loss.
                d_weights = feat.reshape(-1, 1) * (pred - gold)
                d_bias = pred - gold

                self.weights -= self.learning_rate * d_weights
                # Fix: float() on a 1-element ndarray is deprecated and
                # raises under NumPy >= 2.0; .item() is the supported form.
                self.bias -= self.learning_rate * d_bias.item()

        # NOTE(review): the loss is summed over iterations but averaged
        # only over examples — kept to preserve the reported scale.
        losses[self.name] = total_loss / len(examples)
        return losses

    def _make_label_array(self, cats):
        """Convert a list of label dicts into an (n, 1) 0/1 array, where 1
        means the gold "positive" probability exceeds 0.5."""
        arr = np.zeros((len(cats),))
        for i, cat_dict in enumerate(cats):
            if cat_dict.get("positive", 0) > 0.5:
                arr[i] = 1.0
        return arr.reshape(-1, 1)
'test_text = \"This product is fantastic!\"\\n',\n", + " 'doc = nlp(test_text)\\n',\n", + " 'print(f\"\\\\nText: {test_text}\")\\n',\n", + " 'print(f\"Predictions: {doc.cats}\")']}]}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"cells\": [\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"# Pure Logistic Regression Text Categorizer\\n\",\n", + " \"This tutorial demonstrates how to use the custom logistic regression text categorizer.\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": None,\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"import spacy\\n\",\n", + " \"from spacy.training import Example\\n\",\n", + " \"\\n\",\n", + " \"# Load spaCy model\\n\",\n", + " \"nlp = spacy.load(\\\"en_core_web_lg\\\")\\n\",\n", + " \"nlp.add_pipe(\\\"pure_logistic_textcat\\\")\\n\",\n", + " \"\\n\",\n", + " \"# Example training data\\n\",\n", + " \"TRAIN_DATA = [\\n\",\n", + " \" (\\\"This is amazing!\\\", {\\\"cats\\\": {\\\"positive\\\": 1.0, \\\"negative\\\": 0.0}}),\\n\",\n", + " \" (\\\"This is terrible!\\\", {\\\"cats\\\": {\\\"positive\\\": 0.0, \\\"negative\\\": 1.0}})\\n\",\n", + " \"]\\n\",\n", + " \"\\n\",\n", + " \"# Create training examples\\n\",\n", + " \"examples = []\\n\",\n", + " \"for text, annotations in TRAIN_DATA:\\n\",\n", + " \" doc = nlp.make_doc(text)\\n\",\n", + " \" example = Example.from_dict(doc, annotations)\\n\",\n", + " \" examples.append(example)\\n\",\n", + " \"\\n\",\n", + " \"# Train the model\\n\",\n", + " \"textcat = nlp.get_pipe(\\\"pure_logistic_textcat\\\")\\n\",\n", + " \"losses = textcat.update(examples)\\n\",\n", + " \"print(f\\\"Losses: {losses}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# Test the model\\n\",\n", + " \"test_text = \\\"This product is fantastic!\\\"\\n\",\n", + " \"doc = nlp(test_text)\\n\",\n", + " \"print(f\\\"\\\\nText: 
{test_text}\\\")\\n\",\n", + " \"print(f\\\"Predictions: {doc.cats}\\\")\"\n", + " ]\n", + " }\n", + " ]\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}