[N.K. provocation] Results

LDA Topic Model output (20 Topics):

lda_topic

Topics #13 and #17 can be interpreted as “armed provocation” and “nuclear provocation”, respectively. Likewise, topics #14 and #18 can be interpreted as “South-North dialogue” and “international talks”, respectively.

The number of articles belonging to each category is shown in the graph below.

noname01

Independent variable: the public's degree of support for unification, measured on an inverse scale (1: necessary / 5: unnecessary)

reg.png

[NK Provocation Index][1] Identification 1 – LDA model

CODE:

import datetime as dt
import logging
import os
import random
import re
import sys
import warnings
from operator import itemgetter
from pprint import pprint

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from konlpy.tag import Twitter

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Read every file found one level below `dirname`. Each entry appended to
# `data` is a two-element list: [sub-directory name, full file text].
data = []
dirname = 'D://nk//data'
header = {}
k = 0  # running count of files read, used only for the progress print
for f_dir in os.listdir(dirname):
    sub_dir = os.path.join(dirname, f_dir)
    for fname in os.listdir(sub_dir):
        k += 1
        if k % 10000 == 0:
            print(k)
        # `with` guarantees the handle is closed even if read() raises
        # (e.g. on a file that is not valid UTF-8).
        with open(os.path.join(sub_dir, fname), 'r', encoding='utf-8') as f:
            data.append([f_dir, f.read()])

#Document specific Preprocessing

# Date patterns. Each has exactly one capture group holding the date text.
# Raw strings are used so that '\s', '\d', '\-' ... reach the regex engine
# without relying on Python's handling of unknown string escapes.
a1 = re.compile(r'등록\s*\:\s*(\d{4}\s*\-\s*\d{2}\s*\-\s*\d{2})')
a2 = re.compile(r'입력\s*(\d{4}\s*[\.\-]\s*\d{2}\s*[\.\-]\s*\d{2})')
a2_2 = re.compile(r'(\d{4}\s*[\.\-]\s*\d{2}\s*[\.\-]\s*\d{2})')
# The original pattern escaped the opening parenthesis ('\(') but not the
# closing one, which makes re.compile() raise "unbalanced parenthesis".
# A capture group is required because group 1 is read below.
a2_3 = re.compile(r'등록\s*\:(\d{4}\-\d{2}\-\d{2})')
a3 = re.compile(r'((?:19|20)\d{6})')
a4 = re.compile(r'(\d{4}\s*\-\s*\d{2}\s*\-\s*\d{2})')

# One [source, date-digits, text] entry per article in which a date was found.
# Articles in which no pattern matches are left out.
data_date = []
for item in data:
    # Patterns are tried in this order; the first one that matches wins.
    for a in [a1, a2, a2_2, a2_3, a4, a3]:
        found = a.search(item[1])
        if found is None:
            continue
        # Strip separators so that the date becomes a plain digit string.
        date = re.sub(r'[\s\-\.]', '', found[1])
        data_date.append([item[0], date, item[1]])
        if len(date) != 8:
            # Show the start of articles whose date is not 8 digits long.
            print(item[1][:100])
        break

twitter = Twitter()


def sent_to_words(sentences):
    """Tokenise `sentences` with the module-level `twitter` tagger.

    Returns whatever ``twitter.morphs(sentences)`` returns; the argument is
    handed over unchanged.
    """
    morphemes = twitter.morphs(sentences)
    return morphemes

def remove_stopwords(texts):
    """Clean every document in `texts` and drop stop words from it.

    Each document is converted with ``str()``, passed through ``preprocess``
    and the items of the result that are not in the module-level
    ``stop_words`` list are kept. One list is returned per document.

    The original body looked the words up in ``stopwords``, a name this file
    never defines; the stop-word list of this file is called ``stop_words``.

    NOTE(review): ``preprocess`` returns a string, so the inner loop walks
    over single characters, not tokens. Confirm whether a tokenising step
    (e.g. ``sent_to_words``) was meant to sit between the two.
    """
    # Build the lookup once; membership tests on a set are O(1).
    blocked = set(stop_words)
    return [[word for word in preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Run every document in `texts` through the module-level ``bigram_mod``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in the
    order the documents were given.
    """
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document
    in `texts`, in the order the documents were given.
    """
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

def preprocess(doc):
    """Remove Latin-alphabet text from `doc` and normalise its whitespace.

    Steps, in order:
      1. every run of whitespace becomes a single space;
      2. letters immediately followed by digits (e.g. ``abc123``) are deleted;
      3. every remaining run of ASCII letters becomes a single space;
      4. whitespace is collapsed again, because step 3 can create new runs.

    Leading/trailing spaces are not stripped. Returns the cleaned string.
    """
    # Raw strings: '\s' is an invalid escape sequence in a normal literal.
    doc = re.sub(r'\s+', ' ', doc)
    doc = re.sub(r'[A-Za-z]+[0-9]+', '', doc)
    doc = re.sub(r'[a-zA-Z]+', ' ', doc)
    return re.sub(r'\s+', ' ', doc)

#Remove all one-character words except the whitelisted ones below
#(country abbreviations plus a few other single-syllable terms)

country_list = ['미', '북', '러', '중', '일', '한', '군', '핵', '당', '말', '남']
# Keep only Hangul syllables, whitespace and underscores in every token.
# Raw string: '\s' and '\_' are invalid escape sequences in a normal literal.
_keep_hangul = re.compile(r'[^가-힣\s\_]')
data = [[_keep_hangul.sub('', word) for word in item] for item in data]
data = [[word for word in item if (len(word) > 1) or (word in country_list)] for item in data]

# NOTE(review): in the code visible in this file `header` is an empty dict
# and zip() produces 2-tuples, while the stop-word filter further down reads
# item[2]. Presumably intermediate steps are missing from this listing --
# confirm before relying on `data_words`.
data_words = list(zip(header, data))

#Remove document specific stopwords

stop_words = [
    '아티클', '중앙일보', '조선일보', '동아일보', '한겨레', '구독', '관련기사', '아티', '클관련', '추가', '지면보기',
    '종합', '뉴스', '사진', '밝혔', '이라고', '등록', '라고', '라며', '내용', '보다', '경우', '지역', '위해', '이라는',
    '그런', '처럼', '이나', '같은', '보다', '는데', '다면', '그것', '이제',
    '때문', '다시', '많은', '정도', '일이', '없었', '되었', '인가', '않는', '베스트추천', '기자', '수정',
]
# The list itself is kept as the public name; the frozenset only speeds up
# the membership test, which runs once per token of every document.
_stop_lookup = frozenset(stop_words)
# Each entry keeps its first two fields and gets its token list (third field)
# filtered.
data_words = [
    [item[0], item[1], [word for word in item[2] if word not in _stop_lookup]]
    for item in data_words
]

def get_topic(txt):
    """Return the dominant topic of one tokenised document.

    `txt` is converted to a bag-of-words with the module-level ``id2word``
    and scored with the module-level ``lda_model``. The scored entries are
    ordered by their second element, and the first element of the
    highest-ranked entry is returned. When several entries share the top
    score, the one listed last by the model is chosen.
    """
    bow = id2word.doc2bow(txt)
    scored = list(lda_model.get_document_topics(bow))
    # Stable ascending sort, so the best entry ends up last.
    scored.sort(key=itemgetter(1))
    best = scored[-1]
    return best[0]

#return topics and the corresponding words and weights.
# NOTE(review): the value returned by print_topics() is not bound to a name
# here, and `lda_model` is not created anywhere in the visible part of this
# file -- presumably this line was run interactively after training; confirm.
lda_model.print_topics()


#Drawing Graph

# One row per article: [source, date digits, dominant topic id].
result = [[item[0], item[1], get_topic(item[2])] for item in data_words]
df = pd.DataFrame(result, columns=['news', 'date', 'topic'])

#Remove errors in date information

# Dates arrive as YYYYMMDD digit strings. Parsing with errors='coerce' turns
# every impossible date (month 00 or 13, day 32, ...) into NaT, so invalid
# rows are found from the data itself. The previous version dropped four
# hard-coded row positions, which is only correct for one specific corpus.
raw_dates = df['date'].astype(str)
parsed_dates = pd.to_datetime(raw_dates, format='%Y%m%d', errors='coerce')
unparseable = parsed_dates.isna()
for pos in np.flatnonzero(unparseable.to_numpy()):
    # Report what is being discarded: row position and offending value.
    print(pos, raw_dates.iloc[pos])

#Process date information

df['date'] = parsed_dates
# Keep articles from January 1995 onwards. The previous cutoff compared
# 8-digit YYYYMMDD integers with the 6-digit number 199501, which every date
# satisfies, so nothing was filtered. NaT compares False and is dropped here
# together with the pre-1995 rows.
df = df[parsed_dates >= pd.Timestamp('1995-01-01')]

#Adjust sample ratio by putting different weight.

#day

# Number of articles per (day, topic).
count_d = df.groupby(['date', 'topic']).size().reset_index(name = 'count')

# Period boundaries as pandas Timestamps: recent pandas versions refuse to
# order-compare a datetime64 column with a plain datetime.date object.
_weight_cut_1 = pd.Timestamp('2005-01-01')
_weight_cut_2 = pd.Timestamp('2009-10-17')
mask1 = count_d['date'] < _weight_cut_1
# Lower bound is inclusive so that 2005-01-01 itself is weighted; with the
# previous strict '<' on both sides that single day fell into neither period.
mask2 = (count_d['date'] >= _weight_cut_1) & (count_d['date'] < _weight_cut_2)

# Build the weights as floats first, instead of writing fractional values
# into a copy of the integer 'count' column.
_weight = pd.Series(1.0, index=count_d.index)
_weight[mask1] = 2
_weight[mask2] = 4/3
count_d['adj'] = count_d['count'] * _weight

#month 

# Work on a copy so that the daily frame `df` keeps its own dates.
df_ = df.copy()
# (A bare `df_.reset_index()` used to stand here; its result was discarded,
# so it had no effect and has been removed.)
df_['date'] = pd.to_datetime(df_['date'])
df_.set_index('date', inplace = True)
# Replace every date by a timestamp of its month so that grouping by 'date'
# aggregates per month.
df_ = df_.to_period('M').to_timestamp('M')
count_m = df_.groupby(['date', 'topic']).size().reset_index(name = 'count')
count_m['adj'] = count_m['count']
# pd.Timestamp instead of dt.date: recent pandas versions refuse to
# order-compare a datetime64 column with a plain datetime.date object.
mask1 = (count_m['date'] < pd.Timestamp('2005-01-01'))
mask2 = (dt.date(2005, 1, 1) &lt; count_m[&#039;date&#039;]) &amp; (count_m[&#039;date&#039;]<dt>= dt.date(2000,1,1)) &amp; (count_d.date = dt.date(2000,1,1)) &amp; (count_m.date = dt.date(2000,1,1)) &amp; (count_y.date = dt.date(2018, 1, 1))
            count_y.loc[mask3, 'adj'] = count_y.loc[mask3, 'count'] * 6/5
        else:
            return None
        #color = '#00BFFF'
        x = xy['date']
        if adj:
            y = xy['adj']
        else:
            y = xy['count']
        #ax.plot(x, y, alpha = 0.8, c = color, linewidth=1.3)
        #print(label, topic)
        ax.plot(x, y, alpha = 0.7, linewidth=1.3, label = 'topic # : %s(%s)'%(label, str(topic)))
        ax.set_xlim(dt.date(2000, 1,1), dt.date(2018, 12, 31))
        #ax.set_xlim(min(x), max(x))
    #ax.set_title("Spread and GDP", fontsize = 20)
    ax.set_xlabel('year', fontsize = 20)
    ax.set_ylabel('Number of Ariticles' , fontsize= 24)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(color='grey', linestyle='-', linewidth=0, alpha = 1)
    #ax.set_xticks([1995, 1997, 1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017])
    ax.tick_params(axis = 'both', labelsize = 17)
    ax.legend()
    #if period == 'Y':
    #    plt.xticks(np.arange(dt.date(2000, 12, 31), dt.date(2018, 12, 31), dt.timedelta(731)))
    #else:
    plt.xticks(np.arange(dt.date(2000, 1,1), dt.date(2018, 12, 31), dt.timedelta(731)))
    plt.show()

#Draw Graphs (Day, Month, Year frequency on provocation and peace topics respectively) 
# NOTE(review): the topic ids are hard-coded and only meaningful for the
# model they were read from; they must be re-checked after re-training.
# NOTE(review): the second label of topic 14 is 'Global' for the daily and
# monthly plots but 'US-North' for the yearly one -- confirm which is meant.
dp([12, 16], ['provocation', 'nuclear'], 'D', adj = True)
dp([13, 14], ['South-North', 'Global'], 'D', adj = True)
dp([12, 16], ['provocation', 'nuclear'], 'M', adj = True)
dp([13, 14], ['South-North', 'Global'], 'M', adj = True)
# Legend label spelling corrected (was 'provocatoin').
dp([12, 16], ['provocation', 'nuclear'], 'Y', adj = True)
dp([13, 14], ['South-North', 'US-North'], 'Y', adj = True)


#Number of Total Articles

# Line chart of the number of articles per date, all topics together.
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    # Newer Matplotlib releases ship this style under a versioned name only.
    plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize = (9.5, 6.5), dpi = 100)
xy = df.groupby(['date']).size().reset_index(name = 'count')
# The comparison operator had been mangled into the HTML entity '&gt;',
# which is a syntax error. pd.Timestamp is used because recent pandas
# versions refuse to order-compare a datetime64 column with datetime.date.
xy = xy[xy['date'] > pd.Timestamp('1995-01-01')]
x = xy['date']
y = xy['count']
ax.plot(x, y, alpha = 0.8, linewidth=1.3, label = 'total')
ax.set_xlim(min(x), max(x))
ax.set_xlabel('year', fontsize = 20)
# Axis label spelling corrected (was 'Ariticles').
ax.set_ylabel('Number of Articles' , fontsize= 24)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# linewidth=0 makes the grid lines invisible.
ax.grid(color='grey', linestyle='-', linewidth=0, alpha = 1)
ax.legend()
# One tick every 730 days, starting at the first plotted date.
plt.xticks(np.arange(min(x), max(x), dt.timedelta(730)))
plt.show()

[NK Provocation Index][0] Intro

  • North Korea’s military provocations and nuclear threats are likely to hamper Korean economic growth
  • Possible channel: increased risk leads to a decrease in investment and saving
  • X (N.K. provocation) –> X' (investment, saving (consumption) rate) –> Y (economic growth)
  • Identification 1: measuring the degree of N.K. provocation by the number of articles belonging to the ‘Provocation/Nuclear threats’ topics (LDA topic model)
  • Identification 2 : Causality? VAR may be helpful