From c1d21fda77e178b00e9bb389c5d4f5e52fe4ae49 Mon Sep 17 00:00:00 2001 From: Faizan-Ul Huda <61704685+faizanhuda12@users.noreply.github.com> Date: Tue, 18 Nov 2025 22:40:57 -0500 Subject: [PATCH] Update sentencizer.pyx Fixes incorrect sentence segmentation within quoted text by adding quote depth tracking to the sentencizer to correctly handle punctuation inside dialogue. --- spacy/pipeline/sentencizer.pyx | 44 ++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index d2b0a8d4a..09dcded75 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -93,15 +93,49 @@ class Sentencizer(Pipe): if len(doc) > 0: start = 0 seen_period = False + # Track quote nesting depth to defer sentence boundary detection when + # sentence-ending punctuation appears within quoted text + quote_depth = 0 + pending_split_after_quote = False doc_guesses[0] = True for i, token in enumerate(doc): is_in_punct_chars = token.text in self.punct_chars - if seen_period and not token.is_punct and not is_in_punct_chars: - doc_guesses[start] = True - start = token.i - seen_period = False - elif is_in_punct_chars: + + # Update quote depth to track whether the current position is within quoted text. + # This prevents premature sentence splitting when punctuation appears inside quotes. + if token.is_quote: + if token.is_left_punct and token.is_right_punct: + # Symmetric quotes toggle quote state based on current depth. + # If currently outside quotes, the quote acts as an opening quote. + # If currently inside quotes, the quote acts as a closing quote. + quote_depth = 1 if quote_depth == 0 else quote_depth - 1 + elif token.is_left_punct: + # Asymmetric opening quote (e.g., «, “, ‘). + quote_depth += 1 + elif token.is_right_punct: + # Asymmetric closing quote (e.g., », ”, ’). + quote_depth -= 1 + # Ensure quote_depth does not go negative to handle unopened quotes gracefully. 
+ if quote_depth < 0: + quote_depth = 0 + + # Handle sentence-ending punctuation. + if is_in_punct_chars: seen_period = True + # Defer sentence boundary until after closing quote when punctuation + # appears within quoted text. This ensures quoted dialogue is not split + # at punctuation marks that appear inside the quotes. + if quote_depth > 0: + pending_split_after_quote = True + elif seen_period and not token.is_punct and not is_in_punct_chars: + # Create sentence boundary when outside quoted text. + # Only split when quote_depth is zero to avoid splitting within quoted dialogue. + if quote_depth == 0: + if pending_split_after_quote: + pending_split_after_quote = False + doc_guesses[start] = True + start = token.i + seen_period = False if start < len(doc): doc_guesses[start] = True guesses.append(doc_guesses)