mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			62 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			62 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Example of adding a pipeline component to prohibit sentence boundaries
 | |
| before certain tokens.
 | |
| 
 | |
| What we do is write to the token.is_sent_start attribute, which
 | |
| takes values in {True, False, None}. The default value None allows the parser
 | |
| to predict sentence segments. The value False prohibits the parser from inserting
 | |
| a sentence boundary before that token. Note that fixing the sentence segmentation
 | |
| should also improve the parse quality.
 | |
| 
 | |
| The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
 | |
| Other versions of the model may not make the original mistake, so the specific
 | |
| example might not be apt for future versions.
 | |
| 
 | |
| Compatible with: spaCy v2.0.0+
 | |
| Last tested with: v2.1.0
 | |
| """
 | |
| import plac
 | |
| import spacy
 | |
| 
 | |
| 
 | |
| def prevent_sentence_boundaries(doc):
 | |
|     for token in doc:
 | |
|         if not can_be_sentence_start(token):
 | |
|             token.is_sent_start = False
 | |
|     return doc
 | |
| 
 | |
| 
 | |
| def can_be_sentence_start(token):
 | |
|     if token.i == 0:
 | |
|         return True
 | |
|     # We're not checking for is_title here to ignore arbitrary titlecased
 | |
|     # tokens within sentences
 | |
|     # elif token.is_title:
 | |
|     #    return True
 | |
|     elif token.nbor(-1).is_punct:
 | |
|         return True
 | |
|     elif token.nbor(-1).is_space:
 | |
|         return True
 | |
|     else:
 | |
|         return False
 | |
| 
 | |
| 
 | |
| @plac.annotations(
 | |
|     text=("The raw text to process", "positional", None, str),
 | |
|     spacy_model=("spaCy model to use (with a parser)", "option", "m", str),
 | |
| )
 | |
| def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"):
 | |
|     print("Using spaCy model '{}'".format(spacy_model))
 | |
|     print("Processing text '{}'".format(text))
 | |
|     nlp = spacy.load(spacy_model)
 | |
|     doc = nlp(text)
 | |
|     sentences = [sent.text.strip() for sent in doc.sents]
 | |
|     print("Before:", sentences)
 | |
|     nlp.add_pipe(prevent_sentence_boundaries, before="parser")
 | |
|     doc = nlp(text)
 | |
|     sentences = [sent.text.strip() for sent in doc.sents]
 | |
|     print("After:", sentences)
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     plac.call(main)
 |