mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 09:42:26 +03:00
Merge branch 'master' into fix/enum-python-types
This commit is contained in:
commit
2567266bf7
|
@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.9.1,<1.2.0
|
wasabi>=0.9.1,<1.2.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<1.0.0
|
typer-slim>=0.3.0,<1.0.0
|
||||||
weasel>=0.1.0,<0.5.0
|
weasel>=0.1.0,<0.5.0
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=2.0.0,<3.0.0
|
numpy>=2.0.0,<3.0.0
|
||||||
|
|
|
@ -55,7 +55,7 @@ install_requires =
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
weasel>=0.1.0,<0.5.0
|
weasel>=0.1.0,<0.5.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
typer>=0.3.0,<1.0.0
|
typer-slim>=0.3.0,<1.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0; python_version < "3.9"
|
numpy>=1.15.0; python_version < "3.9"
|
||||||
numpy>=1.19.0; python_version >= "3.9"
|
numpy>=1.19.0; python_version >= "3.9"
|
||||||
|
|
|
@ -170,7 +170,7 @@ def debug_model(
|
||||||
msg.divider(f"STEP 3 - prediction")
|
msg.divider(f"STEP 3 - prediction")
|
||||||
msg.info(str(prediction))
|
msg.info(str(prediction))
|
||||||
|
|
||||||
msg.good(f"Succesfully ended analysis - model looks good.")
|
msg.good(f"Successfully ended analysis - model looks good.")
|
||||||
|
|
||||||
|
|
||||||
def _sentences():
|
def _sentences():
|
||||||
|
|
|
@ -218,6 +218,9 @@ class Lemmatizer(Pipe):
|
||||||
if not form:
|
if not form:
|
||||||
pass
|
pass
|
||||||
elif form in index or not form.isalpha():
|
elif form in index or not form.isalpha():
|
||||||
|
if form in index:
|
||||||
|
forms.insert(0, form)
|
||||||
|
else:
|
||||||
forms.append(form)
|
forms.append(form)
|
||||||
else:
|
else:
|
||||||
oov_forms.append(form)
|
oov_forms.append(form)
|
||||||
|
|
|
@ -260,7 +260,7 @@ labels = ['label1', 'label2']
|
||||||
)
|
)
|
||||||
@pytest.mark.issue(6908)
|
@pytest.mark.issue(6908)
|
||||||
def test_issue6908(component_name):
|
def test_issue6908(component_name):
|
||||||
"""Test intializing textcat with labels in a list"""
|
"""Test initializing textcat with labels in a list"""
|
||||||
|
|
||||||
def create_data(out_file):
|
def create_data(out_file):
|
||||||
nlp = spacy.blank("en")
|
nlp = spacy.blank("en")
|
||||||
|
|
|
@ -740,7 +740,7 @@ def test_pass_doc_to_pipeline(nlp, n_process):
|
||||||
assert len(doc.cats) > 0
|
assert len(doc.cats) > 0
|
||||||
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
|
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
|
||||||
# Catch warnings to ensure that all worker processes exited
|
# Catch warnings to ensure that all worker processes exited
|
||||||
# succesfully.
|
# successfully.
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("error")
|
warnings.simplefilter("error")
|
||||||
docs = nlp.pipe(docs, n_process=n_process)
|
docs = nlp.pipe(docs, n_process=n_process)
|
||||||
|
|
|
@ -32,7 +32,7 @@ we use all four in different places, as they all have different utility:
|
||||||
|
|
||||||
The most important classes in spaCy are defined as `cdef class` objects. The
|
The most important classes in spaCy are defined as `cdef class` objects. The
|
||||||
underlying data for these objects is usually gathered into a struct, which is
|
underlying data for these objects is usually gathered into a struct, which is
|
||||||
usually named `c`. For instance, the [`Lexeme`](/api/cython-classses#lexeme)
|
usually named `c`. For instance, the [`Lexeme`](/api/cython-classes#lexeme)
|
||||||
class holds a [`LexemeC`](/api/cython-structs#lexemec) struct, at `Lexeme.c`.
|
class holds a [`LexemeC`](/api/cython-structs#lexemec) struct, at `Lexeme.c`.
|
||||||
This lets you shed the Python container, and pass a pointer to the underlying
|
This lets you shed the Python container, and pass a pointer to the underlying
|
||||||
data into C-level functions.
|
data into C-level functions.
|
||||||
|
|
|
@ -1,5 +1,74 @@
|
||||||
{
|
{
|
||||||
"resources": [
|
"resources": [
|
||||||
|
{
|
||||||
|
"id": "TeNs",
|
||||||
|
"title": "Temporal Expressions Normalization spaCy",
|
||||||
|
"thumb": "https://github-production-user-asset-6210df.s3.amazonaws.com/40547052/433595900-fae3c9d9-7181-4d8b-8b49-e6dc4fca930b.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250414%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250414T235545Z&X-Amz-Expires=300&X-Amz-Signature=e21d3c06300ceb15fa1dadd7cb60081cc9f1b35e5a7bfd07f6e8b90dd7fad9d0&X-Amz-SignedHeaders=host",
|
||||||
|
"url": "https://pypi.org/project/temporal-normalization-spacy/",
|
||||||
|
"slogan": "A temporal expression normalization plugin for Romanian using rule-based methods and DBpedia mappings.",
|
||||||
|
"description": "**[Temporal Expressions Normalization spaCy (TeNs)](https://github.com/iliedorobat/timespan-normalization-spacy)** is a powerful pipeline component for spaCy that seamlessly identifies and parses date entities in text. It leverages the **[Temporal Expressions Normalization Framework]( https://github.com/iliedorobat/timespan-normalization)** to recognize a wide variety of date formats using an extensive set of regular expressions (RegEx), ensuring robust and adaptable date extraction across diverse textual sources.\n\nUnlike conventional solutions that primarily focus on well-structured date formats, TeNs excels in handling real-world text by **identifying** not only standard date representations but also **abbreviated, informal, or even misspelled temporal expressions.** This makes it particularly effective for processing noisy or unstructured data, such as historical records, user-generated content, and scanned documents with OCR inaccuracies.",
|
||||||
|
"github": "iliedorobat/timespan-normalization-spacy",
|
||||||
|
"pip": "temporal-normalization-spacy",
|
||||||
|
"code_example": [
|
||||||
|
"import subprocess",
|
||||||
|
"",
|
||||||
|
"import spacy",
|
||||||
|
"",
|
||||||
|
"from temporal_normalization.commons.print_utils import console",
|
||||||
|
"from temporal_normalization.index import create_normalized_component, TemporalNormalization # noqa: F401",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"try:",
|
||||||
|
" # Load the spaCy model if it has already been downloaded",
|
||||||
|
" nlp = spacy.load('ro_core_news_sm')",
|
||||||
|
"except OSError:",
|
||||||
|
" console.warning(f'Started downloading ro_core_news_sm...')",
|
||||||
|
" # Download the Romanian model if it wasn't already downloaded",
|
||||||
|
" subprocess.run(['python', '-m', 'spacy', 'download', 'ro_core_news_sm'])",
|
||||||
|
" # Load the spaCy model",
|
||||||
|
" nlp = spacy.load('ro_core_news_sm')",
|
||||||
|
"",
|
||||||
|
"# Add 'temporal_normalization' component to the spaCy pipeline",
|
||||||
|
"nlp.add_pipe('temporal_normalization', last=True)",
|
||||||
|
"doc = nlp('Sec al II-lea a.ch. - I d.ch reprezintă o perioadă de mari schimbări.')",
|
||||||
|
"",
|
||||||
|
"# Display information about the identified and normalized dates in the text.",
|
||||||
|
"for entity in doc.ents:",
|
||||||
|
" edges = entity._.time_series.edges",
|
||||||
|
"",
|
||||||
|
" print('Start Edge:')",
|
||||||
|
" print(edges.start.serialize('\\t'))",
|
||||||
|
" print()",
|
||||||
|
"",
|
||||||
|
" print('End Edge:')",
|
||||||
|
" print(edges.end.serialize('\\t'))",
|
||||||
|
" print()",
|
||||||
|
"",
|
||||||
|
" print('Periods:')",
|
||||||
|
" for period in entity._.time_series.periods:",
|
||||||
|
" print(period.serialize('\\t'))",
|
||||||
|
" print()",
|
||||||
|
" print('---------------------')"
|
||||||
|
],
|
||||||
|
"code_language": "python",
|
||||||
|
"author": "Ilie Cristian Dorobat",
|
||||||
|
"author_links": {
|
||||||
|
"github": "iliedorobat",
|
||||||
|
"website": "https://iliedorobat.ro/"
|
||||||
|
},
|
||||||
|
"category": [
|
||||||
|
"pipeline",
|
||||||
|
"standalone"
|
||||||
|
],
|
||||||
|
"tags": [
|
||||||
|
"temporal",
|
||||||
|
"normalization",
|
||||||
|
"date",
|
||||||
|
"romanian",
|
||||||
|
"temporal-expression",
|
||||||
|
"dbpedia"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "spacy-vscode",
|
"id": "spacy-vscode",
|
||||||
"title": "spaCy Visual Studio Code Extension",
|
"title": "spaCy Visual Studio Code Extension",
|
||||||
|
|
Loading…
Reference in New Issue
Block a user