[NK Provocation Index][1] Identification 1 – LDA model

CODE:

import re
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import random
import gensim
import gensim.corpora as corpora
from konlpy.tag import Twitter
from operator import itemgetter
import datetime as dt 
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

data = []
dirname = 'D://nk//data'
header = {}
k = 0
for f_dir in os.listdir(dirname):
    for fname in os.listdir(os.path.join(dirname, f_dir)):
        k += 1
        if k % 10000 == 0:
            print(k)
        f = open(os.path.join(dirname, f_dir, fname), 'r', encoding = 'utf-8')
        data.append([f_dir, f.read()])
        f.close()

#Document specific Preprocessing

a1 = re.compile('등록\s*\:\s*(\d{4}\s*\-\s*\d{2}\s*\-\s*\d{2})')
a2 = re.compile('입력\s*(\d{4}\s*[\.\-]\s*\d{2}\s*[\.\-]\s*\d{2})')
a2_2 = re.compile('(\d{4}\s*[\.\-]\s*\d{2}\s*[\.\-]\s*\d{2})')
a2_3 = re.compile('등록\s*\:\s*(\d{4}\-\d{2}\-\d{2})')
a3 = re.compile('((?:19|20)\d{6})')
a4 = re.compile('(\d{4}\s*\-\s*\d{2}\s*\-\s*\d{2})')
data_date = []
for item in data:
    k = 0
    for a in [a1, a2, a2_2, a2_3, a4, a3]:
        if a.search(item[1]):
            date = re.sub('[\s\-\.]', '', a.search(item[1])[1])
            k = 1
            data_date.append([item[0], date, item[1]])
            if len(str(date)) != 8:
                print(item[1][:100])
            break
    #if k == 0 :
        #print(item[1][:30])

twitter = Twitter()

def sent_to_words(sentences):
    return twitter.morphs(sentences)  #tokenize with the KoNLPy Twitter (Okt) morphological analyzer

def remove_stopwords(texts):
    return [[word for word in preprocess(str(doc)) if word not in stopwords] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def preprocess(doc):
    doc = re.sub('\s+', ' ', doc)
    doc = re.sub('[A-Za-z]+[0-9]+', '', doc)
    doc = re.sub('[a-zA-Z]+', ' ', doc)
    doc = re.sub('\s+', ' ', doc)
    return doc

#Remove all one-character words except country names

country_list = ['미', '북', '러', '중', '일', '한', '군', '핵', '당', '말', '남']
data = [[re.sub('[^가-힣\s\_]', '', word) for word in item] for item in data]
data = [[word for word in item if (len(word) > 1) or (word in country_list)] for item in data]

data_words = list(zip(header, data))

#Remove document specific stopwords

stop_words = ['아티클', '중앙일보', '조선일보', '동아일보', '한겨레', '구독', '관련기사', '아티', '클관련', '추가', '지면보기',
'종합', '뉴스', '사진', '밝혔', '이라고', '등록', '라고', '라며', '내용', '보다', '경우', '지역', '위해', '이라는', '그런', '처럼', '이나', '같은', '보다', '는데', '다면', '그것', '이제',
'때문', '다시', '많은', '정도', '일이', '없었', '되었', '인가', '않는', '베스트추천', '기자', '수정']
data_words = [[item[0], item[1], [word for word in item[2] if word not in stop_words]] for item in data_words]
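
#id2word and lda_model used below are not defined anywhere in this post.
#A minimal sketch with gensim's standard dictionary/corpus/LDA API, assuming the
#tokenized documents are the third field of data_words; num_topics is a
#placeholder (the topic numbers 12-16 used later imply at least that many topics).
texts = [item[2] for item in data_words]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=id2word,
                                   num_topics=20, random_state=1, passes=10)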

def get_topic(txt):
    corpus = id2word.doc2bow(txt)
    topic = list(lda_model.get_document_topics(corpus))
    return sorted(topic, key=itemgetter(1))[-1][0]

#return topics and the corresponding words and weights.
lda_model.print_topics()


#Drawing Graph

result = [[item[0], item[1], get_topic(item[2])] for item in data_words]
df = pd.DataFrame(result)
df.columns = ['news', 'date', 'topic']
df.date = df.date.astype(int)
df = df[df.date >= 19950101]

#Remove errors in date information

for i in range(len(df)):
    if int(str(df['date'].iloc[i])[4:6]) > 12 or int(str(df['date'].iloc[i])[4:6]) == 0:
        print(i, df['date'].iloc[i])
for i in range(len(df)):
    try:
        x = df['date'].iloc[i]
        dt.date(int(str(x)[0:4]), int(str(x)[4:6]), int(str(x)[6:]))
    except:
        print(i, df['date'].iloc[i])

df.drop(df.index[[42981, 43438]], inplace = True)
df.drop(df.index[[65986, 74283]], inplace = True)

#Process date information  

df['date'] = df['date'].apply(lambda x: dt.date(int(str(x)[0:4]), int(str(x)[4:6]), int(str(x)[6:])))
df['date'] = pd.to_datetime(df['date'])

#Adjust the sample ratio by applying different weights.

#day

count_d = df.groupby(['date', 'topic']).size().reset_index(name = 'count')
count_d['adj'] = count_d['count']
mask1 = (count_d['date'] < dt.date(2005, 1, 1))
mask2 = (dt.date(2005, 1, 1) < count_d['date']) & (count_d['date']< dt.date(2009, 10, 17))
count_d.loc[mask1, 'adj'] = count_d.loc[mask1, 'count'] * 2
count_d.loc[mask2, 'adj'] = count_d.loc[mask2, 'count'] * (4/3)

#month 

df_ = df.copy()
df_.reset_index()
df_['date'] = pd.to_datetime(df_['date'])
df_.set_index('date', inplace = True)
df_ = df_.to_period('M').to_timestamp('M')
count_m = df_.groupby(['date', 'topic']).size().reset_index(name = 'count')
count_m['adj'] = count_m['count']
mask1 = (count_m['date'] < dt.date(2005, 1, 1))
mask2 = (dt.date(2005, 1, 1) < count_m['date']) & (count_m['date'] < dt.date(2009, 10, 17))
count_m.loc[mask1, 'adj'] = count_m.loc[mask1, 'count'] * 2
count_m.loc[mask2, 'adj'] = count_m.loc[mask2, 'count'] * (4/3)

#year

df_ = df.copy()
df_['date'] = pd.to_datetime(df_['date'])
df_.set_index('date', inplace = True)
df_ = df_.to_period('Y').to_timestamp('Y')
count_y = df_.groupby(['date', 'topic']).size().reset_index(name = 'count')
count_y['adj'] = count_y['count']
mask1 = (count_y['date'] < dt.date(2005, 1, 1))
mask2 = (dt.date(2005, 1, 1) < count_y['date']) & (count_y['date'] < dt.date(2009, 10, 17))
count_y.loc[mask1, 'adj'] = count_y.loc[mask1, 'count'] * 2
count_y.loc[mask2, 'adj'] = count_y.loc[mask2, 'count'] * (4/3)

#Plotting function: article counts per topic at daily ('D'), monthly ('M'), or yearly ('Y') frequency

def dp(topics, labels, period, adj = False):
    plt.style.use('seaborn-whitegrid')
    fig, ax = plt.subplots(figsize = (9.5, 6.5), dpi = 100)
    for topic, label in zip(topics, labels):
        if period == 'D':
            xy = count_d[(count_d.topic == topic) & (count_d.date >= dt.date(2000, 1, 1))]
        elif period == 'M':
            xy = count_m[(count_m.topic == topic) & (count_m.date >= dt.date(2000, 1, 1))]
        elif period == 'Y':
            #scale up the incomplete final year (2018) of the sample
            mask3 = (count_y.date == dt.date(2018, 1, 1))
            count_y.loc[mask3, 'adj'] = count_y.loc[mask3, 'count'] * 6/5
            xy = count_y[(count_y.topic == topic) & (count_y.date >= dt.date(2000, 1, 1))]
        else:
            return None
        #color = '#00BFFF'
        x = xy['date']
        if adj:
            y = xy['adj']
        else:
            y = xy['count']
        #ax.plot(x, y, alpha = 0.8, c = color, linewidth=1.3)
        #print(label, topic)
        ax.plot(x, y, alpha = 0.7, linewidth=1.3, label = 'topic # : %s(%s)'%(label, str(topic)))
        ax.set_xlim(dt.date(2000, 1,1), dt.date(2018, 12, 31))
        #ax.set_xlim(min(x), max(x))
    #ax.set_title("Spread and GDP", fontsize = 20)
    ax.set_xlabel('year', fontsize = 20)
    ax.set_ylabel('Number of Articles', fontsize = 24)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(color='grey', linestyle='-', linewidth=0, alpha = 1)
    #ax.set_xticks([1995, 1997, 1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017])
    ax.tick_params(axis = 'both', labelsize = 17)
    ax.legend()
    #if period == 'Y':
    #    plt.xticks(np.arange(dt.date(2000, 12, 31), dt.date(2018, 12, 31), dt.timedelta(731)))
    #else:
    plt.xticks(np.arange(dt.date(2000, 1,1), dt.date(2018, 12, 31), dt.timedelta(731)))
    plt.show()

#Draw Graphs (Day, Month, Year frequency on provocation and peace topics respectively) 
dp([12, 16], ['provocation', 'nuclear'], 'D', adj = True)
dp([13, 14], ['South-North', 'Global'], 'D', adj = True)
dp([12, 16], ['provocation', 'nuclear'], 'M', adj = True)
dp([13, 14], ['South-North', 'Global'], 'M', adj = True)
dp([12, 16], ['provocation', 'nuclear'], 'Y', adj = True)
dp([13, 14], ['South-North', 'US-North'], 'Y', adj = True)


#Number of Total Articles

plt.style.use('seaborn-whitegrid')
fig, ax = plt.subplots(figsize = (9.5, 6.5), dpi = 100)
xy = df.groupby(['date']).size().reset_index(name = 'count')
xy = xy[xy['date'] > dt.date(1995, 1, 1)]
#color = '#00BFFF'
x = xy['date']
y = xy['count']
#ax.plot(x, y, alpha = 0.8, c = color, linewidth=1.3)
#print(label, topic)
ax.plot(x, y, alpha = 0.8, linewidth=1.3, label = 'total')
ax.set_xlim(min(x), max(x))
#ax.set_title("Spread and GDP", fontsize = 20)
ax.set_xlabel('year', fontsize = 20)
ax.set_ylabel('Number of Articles', fontsize = 24)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.grid(color='grey', linestyle='-', linewidth=0, alpha = 1)
#ax.set_xticks([1995, 1997, 1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017])
#ax.tick_params(axis = 'both', labelsize = 17)
ax.legend()
plt.xticks(np.arange(min(x), max(x), dt.timedelta(730)))
plt.show()

[NK Provocation Index][0] Intro

  • North Korea’s military provocations and nuclear threats are likely to hamper Korean economic growth
  • Possible channel: increased risk leads investment and saving to decrease
  • X (N.K. provocation) -> X' (investment, saving/consumption rate) -> Y (economic growth)
  • Identification 1: measuring the degree of N.K. provocation by the number of articles belonging to the ‘Provocation/Nuclear threats’ topic (LDA topic model)
  • Identification 2: causality? A VAR model may be helpful (a minimal sketch follows below)
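
A rough idea of what the VAR in Identification 2 could look like with statsmodels, assuming a hypothetical monthly DataFrame (the file and column names below are placeholders) that already holds the provocation index (e.g. monthly article counts in the provocation/nuclear topics) and a growth-related series such as an investment rate:

import pandas as pd
from statsmodels.tsa.api import VAR

#'macro_monthly.csv' and its columns 'provocation_index', 'investment_rate' are hypothetical placeholders
macro = pd.read_csv('macro_monthly.csv', index_col='date', parse_dates=True)

model = VAR(macro[['provocation_index', 'investment_rate']])
results = model.fit(maxlags=12, ic='aic')  #lag order chosen by AIC
print(results.summary())

#Granger-causality check: does the provocation index help predict investment?
print(results.test_causality('investment_rate', ['provocation_index'], kind='f'))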

[NKPR 0] Building Caffe on Windows (for an Anaconda environment)

I recall that installing Caffe on Windows was one of the hardest steps in this project.

 

  • However, some (small) problems arise depending on the environment one has. For me, installing VS 2015 raised the error “a setup package is either missing or damaged”, and no complete fix for this problem exists on the web. (I spent two days repeatedly removing the whole VS 2015 and reinstalling it.)

 

  • In addition, building PyCaffe requires Python 3.5, while I have been using Python 3.6 (Anaconda) for my previous work. Since I did not want to change my working environment, I installed PyCaffe in a separate Anaconda environment (Python 3.5). A few settings should be modified before installing.

 

  1. Create a new environment for Python 3.5 (e.g. conda create -n py35 python=3.5.0 anaconda)
  2. Before using cmd, activate the Anaconda environment (e.g. conda activate py35)
  3. When modifying caffe\caffe\scripts\build_win.cmd according to the video above, set the CONDA_ROOT variable to the location of the Python 3.5 conda environment
  4. Now follow the video!
  5. Done!
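
After the build, a quick sanity check from the py35 environment (this assumes, as the Windows build instructions require, that caffe\python has been added to PYTHONPATH or copied into site-packages):

import caffe
print(caffe.__file__)  #should point into the caffe\python directory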

[USVC] Drawing Supply Chain 2 – US Listed Domestic Firms


<Histogram of firm out nodes, clockwise from left top: 2000, 2005, 2010, 2015>

Numbers

  • The total sample number decreased from 1998 to 2016: {1998: 9062, 1999: 8906, 2000: 8512, 2001: 8167, 2002: 7692, 2003: 7447, 2004: 7498, 2005: 7015, 2006: 6690, 2007: 6676, 2008: 7448, 2009: 7792, 2010: 7427, 2011: 7223, 2012: 6943, 2013: 6783, 2014: 6640, 2015: 6231, 2016: 5850}
  • The number of edges (links between firms), however, decreased even further over the same period
  • The average shortest path length over all possible linkages: {2000: 1.638, 2005: 1.531, 2010: 1.284, 2015: 1.322} (see the sketch below)
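
For reference, a sketch of one way such a number can be computed with networkx, averaging shortest-path lengths over all reachable ordered pairs (my assumption about how the statistic above was obtained):

import networkx as nx

def avg_shortest_path_length(G):
    lengths = []
    for source, targets in nx.all_pairs_shortest_path_length(G):
        #distances from source to every reachable node other than itself
        lengths += [d for target, d in targets.items() if target != source]
    return sum(lengths) / len(lengths)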

 

Possible explanations

  • The (trained natural-language) model may be over-fitted to the early 2000s
  • Supply-chain links among U.S. firms might actually be decreasing due to economic uncertainty

 

10 Firms with the most in-nodes

(year 2000) : [['Walmart Inc', 0.027717626678215677], ['Lucent Technologies Inc', 0.026634906886097875], ['Hewlett Packard Enterprise Co', 0.023170203551320916], ['AT&T Corp', 0.018189692507579038], ['Ford Inc', 0.0173235166738848], ['Cisco Systems Inc', 0.01602425292334344], ['Siemens AG', 0.013858813339107838], ['Boeing Corp', 0.013642269380684278], ['Intel Corp', 0.012126461671719359], ['Target Inc', 0.01169337375487224]]

(year 2015) : [['Walmart Inc', 0.020942408376963352], ['AT&T Corp', 0.010732984293193719], ['Ford Inc', 0.010209424083769635], ['Shell Oil Co', 0.009947643979057593], ['Target Inc', 0.00968586387434555], ['Home Depot Inc. ', 0.009162303664921467], ['Cisco Inc', 0.008638743455497384], ['Microsoft Corp', 0.008638743455497384]]

 

 

[USVC] Drawing Supply Chain 1 – Small Sample

/******************************************************************
— Title : [Python; NetworkX] Supply Chain analysis
— Key word : networkx, Node, Edge, Centrality, Supply Chain, Value Chain
*******************************************************************/

Data

  • About 200 major firms listed on Compustat data
  • Data Set will soon encompass all the firms with CIK code
  • Customer information extracted from 10-k disclosure data

Graph

  • Drawn from the basic networkx graph tool (nx.draw())
  • Year : ordered by year ; 2000, 2005, 2010, 2015
  • Size of node : in_degree_centrality
  • Color of node : out_degree_centrality

<Supply chain graphs, in order : 2000, 2005, 2010, 2015>

Sample Code :

(Reference : https://briandew.wordpress.com/2016/06/15/trade-network-analysis-why-centrality-matters/)

import networkx as nx
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

def draw_G(G, year):
    oc = nx.out_degree_centrality(G)
    for key in oc.keys():
        oc[key] = oc[key]*10
    nx.set_node_attributes(G, name= 'cent', values = oc)
    ic = nx.in_degree_centrality(G)
    nx.set_node_attributes(G, name= 'in', values = ic)
    node_size = [float(G.nodes[v]['in'])*20000 + 1 for v in G]
    node_color = [float(G.nodes[v]['cent']) for v in G]
    pos = nx.spring_layout(G, k=30, iterations=8)
    nodes = nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color = node_color, alpha=0.5)
    #nodes = nx.draw_networkx_nodes(G, pos, node_color=node_color, alpha=0.5)
    edges = nx.draw_networkx_edges(G, pos, edge_color='black', arrows=True, width=0.3)
    nx.draw_networkx_labels(G, pos, font_size=5)
    plt.text(0,-1.2, 'Node color is out_degree_centrality', fontsize=7)
    plt.title('Compustat firms Supply Chain (year : ' + str(year) + ')', fontsize=12)
    cbar = plt.colorbar(mappable=nodes, cax=None, ax=None, fraction=0.015, pad=0.04)
    cbar.set_clim(0, 1)
    plt.margins(0,0)
    plt.axis('off')
    plt.savefig(str(year)+ 'Supply Chain.png', dpi=1000)
    plt.show()
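
The code above defines draw_G but does not show how G is built. A minimal usage sketch, assuming (hypothetically) that the supplier-customer pairs extracted from the 10-K filings are stored in a CSV with 'supplier', 'customer', and 'year' columns:

edges = pd.read_csv('supply_chain_edges.csv')  #hypothetical file and column names
for year in [2000, 2005, 2010, 2015]:
    sub = edges[edges['year'] == year]
    G = nx.from_pandas_edgelist(sub, source='supplier', target='customer',
                                create_using=nx.DiGraph())
    draw_G(G, year)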

Numbers (Statistics)

  • Longest path :

[USVC 2] CLASSIFYING TEXT

I found that customer information is stated in two forms.

One is in sentence form and the other is in table form.

Hence, I started by dividing the 10-K text into two categories, text and table (by using HTML tags).

The methods for dealing with them, however, are similar: TEXT CLASSIFICATION.

*GOAL 1 (sentence form):

Classifying sentences by whether they are relevant to customer information or not.

Example Sentences:

Net sales to the Company’s three major customers, Staples, Inc., Office Max, and United Stationers, Inc., represented approximately 43% in 2004, 46% in 2003 and 46% in 2002.

For fiscal 2003, Fujitsu accounted for approximately 31 percent of our consolidated accounts receivable and approximately 13 percent of our consolidated gross sales.

In 2004, Matyep in Mexico represented 11.0.% of our consolidated revenues and Burlington Resources Inc. represented 10.1%.

Fleetwood was the Company’s largest customer in 2004, representing approximately 31% of total sales.

I hoped that there would be some rules or sentence structures that could cover all of the customer information in 10-Ks. I tried finding those rules manually and ended up with 24 kinds of sentences. Although they help me find every sentence containing customer information listed in the Compustat data (used as a reference point throughout my research), some of the sentences captured by those 24 rules have nothing to do with revenue information.
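
For illustration, a hypothetical example of this kind of rule (not one of the 24 actual patterns), flagging sentences like the ones above:

import re

#Flag sentences of the form "... accounted for / represented approximately N% (or N percent) ..."
customer_rule = re.compile(
    r'(?:accounted for|represented)\s+approximately\s+\d+(?:\.\d+)?\s*(?:%|percent)',
    re.IGNORECASE)

sent = ('For fiscal 2003, Fujitsu accounted for approximately 31 percent '
        'of our consolidated gross sales.')
print(bool(customer_rule.search(sent)))  #True -> candidate customer sentence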

To get rid of those irrelevant sentences, I adopted machine-learning techniques.

*Annotation (GOLD-STANDARD);

-prodigy

*spaCy

 

*Scikit-Learn
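
A minimal sketch of how the scikit-learn step could look, assuming the Prodigy gold-standard annotations are exported to a hypothetical CSV of (sentence, label) pairs, with label 1 for sentences that state customer/revenue-share information:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

annotated = pd.read_csv('customer_sentences_gold.csv')  #hypothetical file with 'sentence', 'label' columns

X_train, X_test, y_train, y_test = train_test_split(
    annotated['sentence'], annotated['label'], test_size=0.2, random_state=0)

#Bag-of-words baseline: tf-idf over word 1-2 grams + logistic regression
clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), min_df=2),
                    LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))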