
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
from spacy.tokens import Span
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
pd.set_option('display.max_colwidth', 200)
%matplotlib inline
Import Trump's tweets:
# Load the scraped tweets (a single 'Tweets' column) and show the dimensions.
list_tweets = pd.read_csv("trump.csv")
list_tweets.shape
Result:
(678, 1)
Let’s check a few samples of the tweets:
# Eyeball two random raw tweets.
list_tweets['Tweets'].sample(2)
Result:
222 wow just starting to hear the democrats who are only thinking obstruct and delay are starting to put out the word that the time and scope of fbi looking into judge kavanaugh and witnesses is not e...
127 yesterday was a bad day for the cuomo brothers new york was lost to the looters thugs radical left and all others forms of lowlife amp scum the governor refuses to accept my offer of a dominating ...
Name: Tweets, dtype: object
Check the subject and object of one of these tweets.
# Parse a sample sentence and print each token with its dependency label.
doc = nlp("US election 2020: We put Republicans and Democrats in a group chat.")
for token in doc:
    print(token.text, " > ", token.dep_)
Result:
US > compound
election > ROOT
2020 > nummod
: > punct
We > nsubj
put > ROOT
Republicans > dobj
and > cc
Democrats > conj
in > prep
a > det
group > compound
chat > pobj
. > punct
Function to extract the subject and the object (entities) from a sentence.
def getentities_fromtweet(sent):
    """Extract a (subject, object) entity pair from a sentence.

    Walks the dependency parse once, accumulating compound prefixes and
    modifiers, and attaches them to the subject-like and object-like
    tokens encountered.

    Parameters
    ----------
    sent : str
        Raw sentence / tweet text.

    Returns
    -------
    list[str]
        ``[subject_phrase, object_phrase]``; either may be "" when absent.
    """
    ent1 = ""          # subject entity
    ent2 = ""          # object entity
    prv_tok_dep = ""   # dependency tag of previous token in the sentence
    prv_tok_text = ""  # previous token in the sentence
    prefix = ""        # pending compound prefix (e.g. "group" in "group chat")
    modifier = ""      # pending modifier (amod, nummod, ...)

    for tok in nlp(sent):
        if tok.dep_ == "punct":
            continue

        # Accumulate compound words; chain with the previous compound token.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Accumulate modifiers.
        # Fixed: was `tok.dep_.endswith("mod") == True` — redundant comparison.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Subject token (nsubj, nsubjpass, csubj, ...).
        # Fixed: was `tok.dep_.find("subj") == True`, which only matched when
        # "subj" started at index 1 and would miss a bare "subj" tag.
        if "subj" in tok.dep_:
            # Join only the non-empty parts to avoid stray double spaces.
            ent1 = " ".join(p for p in (modifier, prefix, tok.text) if p)
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""

        # Object token (dobj, pobj, iobj, obj, ...). Same fix as above.
        if "obj" in tok.dep_:
            ent2 = " ".join(p for p in (modifier, prefix, tok.text) if p)

        # Remember this token for the next iteration.
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]
It seems to be working as planned: in the test sentence below, ‘car’ is the subject and ‘200 colors’ is the object.
# Sanity check: expect subject 'car' and object '200 colors'.
getentities_fromtweet("the car has 200 colors")
Result:
['car', '200 colors']
The loop below applies the function to extract the subject and the object (entities) from every tweet:
# Extract a (subject, object) pair from every tweet, with a tqdm progress bar.
pairs_entity = [getentities_fromtweet(tweet) for tweet in tqdm(list_tweets["Tweets"])]
Result:
100%|██████████| 678/678 [00:08<00:00, 78.01it/s]
Here are a few of the subject–object pairs extracted from the tweets:
# Inspect a slice of the extracted subject-object pairs.
pairs_entity[11:20]
Result:
[['why i', 'presidential bid trumpvlog'],
['higher self', 'direct donald j trump'],
['that', 'federal cont'],
['china', 'anywhere world'],
['they', 'away dems'],
['success', 'challenges setbacks'],
['always you', 'one'],
['big things', 'businesses'],
['here that', 'business prospects']]
Using spaCy rule-based matching:
def get_relation(sent):
    """Return the relation/predicate of *sent* via rule-based matching.

    Matches the ROOT token, optionally followed by a preposition, an
    agent, or an adjective, and returns the text of the last such span.

    Parameters
    ----------
    sent : str

    Returns
    -------
    str
        Matched relation text, or "" when nothing matches.
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # Define the pattern: ROOT verb + optional prep / agent / adjective.
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    # Fixed: spaCy v3 passes patterns as a list — the old
    # `matcher.add(name, None, pattern)` signature raises a TypeError.
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # Defensive guard: avoid IndexError when nothing matched.
        return ""

    # Use the last (longest-anchored) match.
    k = len(matches) - 1
    span = doc[matches[k][1]:matches[k][2]]
    return span.text
# Sanity check: the ROOT verb ("completed") should be returned.
get_relation("Faisal completed the task")
Result:
completed
Get the relations from all the dataset:
# Extract the relation/predicate for every tweet (progress bar via tqdm).
relations = [get_relation(i) for i in tqdm(list_tweets['Tweets'])]
Result:
100%|██████████| 678/678 [00:08<00:00, 78.54it/s]
The most frequent relations or predicates that we have just extracted
# Frequency table of the 20 most common predicates.
pd.Series(relations).value_counts()[:20]
Result:
is 45
have 26
be 19
thank 18
wow 16
was 14
with 9
see 8
want 8
s 7
let 7
think 7
get 6
said 6
working 6
are 6
yes 5
know 5
has 5
need 5
dtype: int64
Build a Knowledge Graph
# Split each (subject, object) pair into parallel source/target columns.
source = [pair[0] for pair in pairs_entity]   # subjects
target = [pair[1] for pair in pairs_entity]   # objects

# Edge list: one row per tweet, labelled with its extracted relation.
kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})
Create a directed-graph from a dataframe
# Directed multigraph from the edge list; the 'edge' column becomes an edge attribute.
G = nx.from_pandas_edgelist(kg_df, "source", "target",
edge_attr=True, create_using=nx.MultiDiGraph())
Graph all the relations
# Render the full graph with a spring layout and labelled nodes.
plt.figure(figsize=(12, 12))
layout = nx.spring_layout(G)
nx.draw(G, pos=layout, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues)
plt.show()

Let’s graph the relation of “have”
# Subgraph restricted to edges whose predicate is "have".
# Fixed: the from_pandas_edgelist call was missing its closing
# parenthesis, which made this cell a SyntaxError.
G = nx.from_pandas_edgelist(kg_df[kg_df['edge']=="have"], "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())
plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

Another graph, for the relation “thank”:
# Subgraph restricted to edges whose predicate is "thank".
G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == "thank"], "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12, 12))
layout = nx.spring_layout(G, k=0.5)
nx.draw(G, pos=layout, with_labels=True, node_color='skyblue', node_size=1500,
        edge_cmap=plt.cm.Blues)
plt.show()

One more graph, for the relation “is”:
# Subgraph restricted to edges whose predicate is "is".
# Fixed: the filter was copy-pasted as "thank", so this cell silently
# re-plotted the "thank" graph instead of the "is" graph.
G = nx.from_pandas_edgelist(kg_df[kg_df['edge']=="is"], "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())
plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

Leave a comment