Build your own knowledge graph from text using Trump's tweets: a Python example

import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher 
from spacy.tokens import Span 
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
pd.set_option('display.max_colwidth', 200)
%matplotlib inline

Import Trump's tweets:

# Load the tweets CSV into a DataFrame; the path is relative to the working directory.
tweets_path = "trump.csv"
list_tweets = pd.read_csv(tweets_path)
# Display the (rows, columns) shape as the cell output.
list_tweets.shape


Result:
(678, 1)

Let’s check a few sample tweets:

# Show two randomly chosen entries from the 'Tweets' column.
list_tweets['Tweets'].sample(2)

Result:
222    wow just starting to hear the democrats who are only thinking obstruct and delay are starting to put out the word that the time and scope of fbi looking into judge kavanaugh and witnesses is not e...
127    yesterday was a bad day for the cuomo brothers new york was lost to the looters thugs radical left and all others forms of lowlife amp scum the governor refuses to accept my offer of a dominating ...
Name: Tweets, dtype: object

Check the dependency tag of every token in a sample sentence to see which tokens act as its subject and object.

# Parse a sample sentence with the loaded spaCy pipeline.
doc = nlp("US election 2020: We put Republicans and Democrats in a group chat.")

# Print every token next to its dependency label.
for token in doc:
    print(token.text, "  >     ", token.dep_)

Result:
US            >      compound
election      >      ROOT
2020          >      nummod
:             >      punct
We            >      nsubj
put           >      ROOT
Republicans   >      dobj
and           >      cc
Democrats     >      conj
in            >      prep
a             >      det
group         >      compound
chat          >      pobj
.             >      punct

Function to extract the subject and the object (entities) from a sentence.

def getentities_fromtweet(sent):
    """Extract a [subject, object] entity pair from one sentence.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and its
    tokens are scanned left to right. Compound words and modifiers seen
    before a subject/object token are prepended to that token's text.

    Args:
        sent: The sentence (tweet) text to analyse.

    Returns:
        A two-item list ``[subject, object]``. Each item is stripped of
        leading/trailing whitespace and is ``""`` when no token with a
        matching dependency label was found. When several subjects or
        objects occur, the last one encountered wins.
    """
    ## part 1
    ent1 = ""  # subject entity
    ent2 = ""  # object entity

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence
    prefix = ""         # compound word(s) preceding the current token
    modifier = ""       # modifier word(s) preceding the current token

    for tok in nlp(sent):
        ## part 2
        # Punctuation carries no entity information and must not update
        # the "previous token" bookkeeping either.
        if tok.dep_ == "punct":
            continue

        # check: token is a compound word or not
        if tok.dep_ == "compound":
            prefix = tok.text
            # Two compounds in a row are merged into one prefix.
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # check: token is a modifier or not
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        ## chunk 3
        # Substring membership instead of `find(...) == True`: the latter is
        # only true when "subj" starts at index 1 of the label, which holds
        # for "nsubj"/"csubj" by coincidence rather than by design.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            # Start collecting prefix/modifier afresh for the object.
            prefix = ""
            modifier = ""

        ## chunk 4
        # Same reasoning as above: matches "dobj"/"pobj"/"iobj" and also a
        # bare "obj" label.
        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        ## chunk 5
        # update variables
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

It seems to be working as planned. In the sentence below, ‘car’ is the subject and ‘200 colors’ is the object.

# Quick check of the extractor on a simple sentence.
getentities_fromtweet("the car has 200 colors")

Result:
['car', '200  colors']

The loop below extracts the subject and the object (entities) from every tweet:

# Run the entity extractor over every tweet; tqdm shows a progress bar.
pairs_entity = [
    getentities_fromtweet(tweet) for tweet in tqdm(list_tweets["Tweets"])
]

Result:
100%|██████████| 678/678 [00:08<00:00, 78.01it/s]

This gives the list of subject-object pairs from the tweets. Here are a few of them:

# Peek at the extracted pairs at positions 11 through 19.
pairs_entity[11:20]

Result:
[['why  i', 'presidential bid trumpvlog'],
 ['higher  self', 'direct donald j trump'],
 ['that', 'federal  cont'],
 ['china', 'anywhere  world'],
 ['they', 'away  dems'],
 ['success', 'challenges setbacks'],
 ['always  you', 'one'],
 ['big  things', 'businesses'],
 ['here  that', 'business prospects']]

Using spaCy rule-based matching:

def get_relation(sent):
    """Return the relation (predicate) text of a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    searched with a rule-based ``Matcher`` for the ROOT token, optionally
    followed by a preposition, an agent word and an adjective.

    Args:
        sent: The sentence (tweet) text to analyse.

    Returns:
        The text of the last match reported by the matcher, or ``""`` when
        nothing matches (for example, an empty input string).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern: ROOT [prep] [agent] [ADJ]
    pattern = [
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]

    # Patterns are passed as a list of patterns: this form is accepted from
    # spaCy 2.2.2 onwards and is the only one supported by spaCy 3.x (the
    # older `add(key, None, pattern)` call fails there).
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # No ROOT found (e.g. empty text): avoid IndexError on matches[-1].
        return ""

    # Each match is (match_id, start, end); use the last one reported.
    _, start, end = matches[-1]
    return doc[start:end].text
# Test the function on a simple subject-verb-object sentence.
get_relation("Faisal completed the task")

Result:
completed

Get the relations from all the dataset:

# Extract one relation per tweet; tqdm shows a progress bar.
relations = [get_relation(i) for i in tqdm(list_tweets['Tweets'])]

Result:
100%|██████████| 678/678 [00:08<00:00, 78.54it/s]

Here are the most frequent relations (predicates) that we have just extracted:

# Count the occurrences of each relation and keep the first 20 rows of the result.
pd.Series(relations).value_counts()[:20]

Result:
is         45
have       26
be         19
thank      18
wow        16
was        14
with        9
see         8
want        8
s           7
let         7
think       7
get         6
said        6
working     6
are         6
yes         5
know        5
has         5
need        5
dtype: int64

Build a Knowledge Graph


# Each entry of pairs_entity is a [subject, object] pair.
# Subjects become the edge sources of the graph.
source = [subject for subject, _ in pairs_entity]

# Objects become the edge targets of the graph.
target = [obj for _, obj in pairs_entity]

# One row per tweet: subject -> object, labelled with the extracted relation.
kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

Create a directed-graph from a dataframe

# Build a directed multigraph from the edge list; edge_attr=True attaches
# the remaining column(s) of kg_df to each edge as attributes.
G = nx.from_pandas_edgelist(
    kg_df,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

Graph all the relations


# Draw the whole graph on a 12x12 inch figure using a spring layout.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)
nx.draw(
    G,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    edge_cmap=plt.cm.Blues,
)
plt.show()

Let’s graph the relation of “have”


# Sub-graph restricted to the rows whose relation (edge) is "have".
# The from_pandas_edgelist call must be closed before the next statement,
# otherwise the cell fails with a SyntaxError.
G = nx.from_pandas_edgelist(
    kg_df[kg_df['edge'] == "have"],
    "source",
    "target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

plt.figure(figsize=(12, 12))
# k is passed to spring_layout to control the spacing between nodes.
pos = nx.spring_layout(G, k=0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)
plt.show()

Another graph, this time for the relation “thank”:

# Sub-graph restricted to the rows whose relation (edge) is "thank".
thank_rows = kg_df[kg_df['edge'] == "thank"]
G = nx.from_pandas_edgelist(
    thank_rows,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

# Draw it with a spring layout and enlarged nodes.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, k=0.5)
nx.draw(
    G,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=1500,
    edge_cmap=plt.cm.Blues,
)
plt.show()

One more graph, for the relation “is”:


# Sub-graph restricted to the rows whose relation (edge) is "is".
# The filter previously repeated "thank", which redrew the prior graph
# instead of the "is" relation this section is about.
G = nx.from_pandas_edgelist(
    kg_df[kg_df['edge'] == "is"],
    "source",
    "target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

plt.figure(figsize=(12, 12))
# k is passed to spring_layout to control the spacing between nodes.
pos = nx.spring_layout(G, k=0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)
plt.show()
الصورة الرمزية لـ admin

اترك تعليقاً

لن يتم نشر عنوان بريدك الإلكتروني. الحقول الإلزامية مشار إليها بـ *