https://stackabuse.com/python-for-nlp-vocabulary-and-phrase-matching-with-spacy/
================================================================================
techniques=[
    tokenization, stemming, lemmatization, parts_of_speech_tagging,
    named_entity_recognition, noun_parsing]
output=techniques(INDIVIDUAL_WORDS)
================================================================================
Pattern matching

defined_patterns="xxxxx"
text="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliis esse maiora, illud dubium, ad id, quod summum bonum dicitis, ecquaenam possit fieri accessio. Respondent extrema primis, media utrisque, omnia omnibus. Sequitur disserendi ratio cognitioque naturae"
matched_phrases=pattern_matching(text, defined_patterns)
================================================================================
Same functionality with a regular expression

reg_expression="pattern_to_extract_POS_tags"
text="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliis esse maiora, illud dubium, ad id, quod summum bonum dicitis, ecquaenam possit fieri accessio. Respondent extrema primis, media utrisque, omnia omnibus. Sequitur disserendi ratio cognitioque naturae"
matched_POS_tags=find(reg_expression, text)
================================================================================
1. Rule-Based Matching

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

# @ Creating the Matcher object
m_tool = Matcher(nlp.vocab)

# @ Defining patterns
# @ Patterns are used to filter "similar phrases" from the "given text"
# @ Suppose we want to find the following phrases

# Pattern: quickbrownfox
p1 = [{'LOWER': 'quickbrownfox'}]
# Pattern: quick-brown-fox
p2 = [{'LOWER': 'quick'}, {'IS_PUNCT': True}, {'LOWER': 'brown'}, {'IS_PUNCT': True}, {'LOWER': 'fox'}]
# Pattern: quick brown fox
p3 = [{'LOWER': 'quick'}, {'LOWER': 'brown'}, {'LOWER': 'fox'}]
# Pattern: quick brownfox
p4 = [{'LOWER': 'quick'}, {'LOWER': 'brownfox'}]

# The LOWER attribute compares against the lowercased form of each token,
# so matching is case-insensitive.

# Add the patterns to the Matcher object.
# 'QBF' is an arbitrary name for this group of patterns.
m_tool.add('QBF', [p1, p2, p3, p4])  # spaCy v3 signature; in v2: m_tool.add('QBF', None, p1, p2, p3, p4)

# Prepare a sentence
sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. the quickbrownfox is dead. the dog misses the quick brownfox')

# Pass the sentence to the Matcher object.
# The output is a list of (match id, start, end) tuples: the id of the matched
# pattern group plus the start and end token positions in the document.
phrase_matches = m_tool(sentence)
print(phrase_matches)
# [(12825528024649263697, 1, 6),
#  (12825528024649263697, 13, 16),
#  (12825528024649263697, 21, 22),
#  (12825528024649263697, 29, 31)]

# Four phrases have been matched.
# 12825528024649263697: the id of the matched pattern group
# 1, 6: start and end token positions of the match

# Map each match id back to its string name and the matched span
for match_id, start, end in phrase_matches:
    string_id = nlp.vocab.strings[match_id]
    span = sentence[start:end]
    print(match_id, string_id, start, end, span.text)
# 12825528024649263697 QBF 1 6 quick-brown-fox
# 12825528024649263697 QBF 13 16 quick brown fox
# 12825528024649263697 QBF 21 22 quickbrownfox
# 12825528024649263697 QBF 29 31 quick brownfox
================================================================================
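The Matcher can also filter on linguistic attributes instead of literal text, which is what the "pattern_to_extract_POS_tags" note above hints at. A minimal sketch of that idea (the ADJ_NOUN pattern name and the example sentence are my own, not from the article):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Match an adjective immediately followed by a noun via the POS token attribute
adj_noun = [{'POS': 'ADJ'}, {'POS': 'NOUN'}]
matcher.add('ADJ_NOUN', [adj_noun])  # hypothetical pattern name, spaCy v3 signature

doc = nlp(u'The quick brown fox jumps over the lazy dog.')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], start, end, doc[start:end].text)
# Expected matches (tags depend on the model): "brown fox" and "lazy dog"
================================================================================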
1-1. More Options for Rule-Based Matching

# Remove the previous pattern group before defining a new one
m_tool.remove('QBF')

# Pattern: "quick--brown--fox" or "quick-brown---fox", using the '*' operator
p1 = [{'LOWER': 'quick'}, {'IS_PUNCT': True, 'OP': '*'},
      {'LOWER': 'brown'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'fox'}]
m_tool.add('QBF', [p1])  # spaCy v3 signature; in v2: m_tool.add('QBF', None, p1)

# The pattern p1 matches every variant of the phrase "quick brown fox" with zero
# or more punctuation tokens between the words ('*' means zero or more).

sentence = nlp(u'The quick--brown--fox jumps over the quick-brown---fox')

# Expected matches: quick--brown--fox and quick-brown---fox
phrase_matches = m_tool(sentence)
for match_id, start, end in phrase_matches:
    string_id = nlp.vocab.strings[match_id]
    span = sentence[start:end]
    print(match_id, string_id, start, end, span.text)
# 12825528024649263697 QBF 1 6 quick--brown--fox
# 12825528024649263697 QBF 10 15 quick-brown---fox
================================================================================
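Besides '*' (zero or more), the OP key also accepts '+' (one or more), '?' (zero or one), and '!' (the token must not match). A minimal sketch with '+' (the QBF_PLUS name and the example sentence are my own): the words must now be separated by at least one punctuation token, so a plain "quick brown fox" no longer matches.

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# '+' requires at least one punctuation token between the words
p = [{'LOWER': 'quick'}, {'IS_PUNCT': True, 'OP': '+'},
     {'LOWER': 'brown'}, {'IS_PUNCT': True, 'OP': '+'}, {'LOWER': 'fox'}]
matcher.add('QBF_PLUS', [p])

sentence = nlp(u'The quick-brown-fox jumps over the quick brown fox')
for match_id, start, end in matcher(sentence):
    print(sentence[start:end].text)
# quick-brown-fox   (only the hyphenated form matches)
================================================================================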
2. Phrase-Based Matching

# Instead of defining rules, we can directly specify the phrases we are looking for.
# This is more efficient when the exact phrases are known in advance.
# Goal: phrase matching inside the Wikipedia article on artificial intelligence.

# Download and parse the Wikipedia article
import bs4 as bs
import urllib.request
import re

scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scraped_data.read()

# parsed_article: the parsed Wikipedia HTML
parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')

article_text = ""
for p in paragraphs:
    article_text += p.text

# Lowercase, keep letters only, then collapse whitespace
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub(r'\s+', ' ', processed_article)

# Phrase-based matching steps
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')

# Create the PhraseMatcher object
phrase_matcher = PhraseMatcher(nlp.vocab)

# Create a list of phrases to match,
# then convert the list into spaCy Doc objects
phrases = ['machine learning', 'robots', 'intelligent agents']
patterns = [nlp(text) for text in phrases]

# Finally, add the phrase patterns to phrase_matcher.
# 'AI' is the name of this pattern group.
phrase_matcher.add('AI', patterns)  # spaCy v3 signature; in v2: phrase_matcher.add('AI', None, *patterns)

# Convert the processed article into a spaCy Doc
sentence = nlp(processed_article)

# Apply phrase_matcher to the document
matched_phrases = phrase_matcher(sentence)
print("matched_phrases", matched_phrases)
# (token positions below were recorded when the article was scraped
#  and will shift as the Wikipedia page changes)
# [(5530044837203964789, 37, 39),
#  (5530044837203964789, 402, 404),
#  (5530044837203964789, 693, 694),
#  (5530044837203964789, 1284, 1286),
#  (5530044837203964789, 3059, 3061),
#  (5530044837203964789, 3218, 3220),
#  (5530044837203964789, 3753, 3754),
#  (5530044837203964789, 5212, 5213),
#  (5530044837203964789, 5287, 5288),
#  (5530044837203964789, 6769, 6771),
#  (5530044837203964789, 6781, 6783),
#  (5530044837203964789, 7496, 7498),
#  (5530044837203964789, 7635, 7637),
#  (5530044837203964789, 8002, 8004),
#  (5530044837203964789, 9461, 9462),
#  (5530044837203964789, 9955, 9957),
#  (5530044837203964789, 10784, 10785),
#  (5530044837203964789, 11250, 11251),
#  (5530044837203964789, 12290, 12291),
#  (5530044837203964789, 12411, 12412),
#  (5530044837203964789, 12455, 12456)]

for match_id, start, end in matched_phrases:
    string_id = nlp.vocab.strings[match_id]
    span = sentence[start:end]
    print(match_id, string_id, start, end, span.text)
# 5530044837203964789 AI 37 39 intelligent agents
# 5530044837203964789 AI 402 404 machine learning
# 5530044837203964789 AI 693 694 robots
# 5530044837203964789 AI 1284 1286 machine learning
# 5530044837203964789 AI 3059 3061 intelligent agents
# 5530044837203964789 AI 3218 3220 machine learning
# 5530044837203964789 AI 3753 3754 robots
# 5530044837203964789 AI 5212 5213 robots
# 5530044837203964789 AI 5287 5288 robots
# 5530044837203964789 AI 6769 6771 machine learning
# 5530044837203964789 AI 6781 6783 machine learning
# 5530044837203964789 AI 7496 7498 machine learning
# 5530044837203964789 AI 7635 7637 machine learning
# 5530044837203964789 AI 8002 8004 machine learning
# 5530044837203964789 AI 9461 9462 robots
# 5530044837203964789 AI 9955 9957 machine learning
# 5530044837203964789 AI 10784 10785 robots
# 5530044837203964789 AI 11250 11251 robots
# 5530044837203964789 AI 12290 12291 robots
# 5530044837203964789 AI 12411 12412 robots
# 5530044837203964789 AI 12455 12456 robots

# All three phrases we searched for are found, along with their
# start and end indices and their string ids.
================================================================================
Stop Words in spaCy

# The spaCy library ships with a default list of 305 stop words
# (the exact count can vary between versions).
# Depending on our requirements, we can add stop words to or remove them from this set.

# Default spaCy stop words
import spacy

sp = spacy.load('en_core_web_sm')
print(sp.Defaults.stop_words)
# {'less', 'except', 'top', 'me', 'three', 'fifteen', 'a', 'is', 'those', 'all', 'then', ...

# Check whether a word is a stop word
print(sp.vocab['wonder'].is_stop)   # False

# Add 'wonder' to the stop word set and flag it in the vocab
sp.Defaults.stop_words.add('wonder')
sp.vocab['wonder'].is_stop = True
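Going the other way, a default stop word can be removed, and the is_stop attribute makes it easy to filter stop words out of a document. A minimal sketch (the choice of 'always' and the sample sentence are my own, not from the article):

import spacy

sp = spacy.load('en_core_web_sm')

# Remove 'always' from the default stop word set and clear its vocab flag
sp.Defaults.stop_words.remove('always')
sp.vocab['always'].is_stop = False

# Filter stop words out of a document using the is_stop token attribute
doc = sp(u'Nick always plays football, but he is not too fond of tennis.')
print([t.text for t in doc if not t.is_stop])
# Expected (roughly): ['Nick', 'always', 'plays', 'football', ',', 'fond', 'tennis', '.']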