Predicting Terrorism

In [1]:
import pandas as pd
import glob
import pandas as pd
import numpy as np

db = '/Users/jennydoyle/Desktop/dsi/04-project-assets/globalterrorismdb_0616dist.csv'
df = pd.read_csv(db, header=0,low_memory=False)
print 'Size = ', df.shape
Size =  (156772, 137)

I. EDA

In [2]:
def eda(dataframe):
    from IPython.core import display as ICD
    print "dataframe shape: ",dataframe.shape      ## rows by columns
    print ''
    print 'num duplicates:', dataframe.duplicated().sum() ## df.drop_duplicates() to remove dupes
    print ''
    print "pct missing values \n"
    ICD.display(pd.DataFrame(dataframe.isnull().sum()/len(df)*100),columns=['Columns','Pct Missing']), ## count number of null values per column
    print ''
    print "dataframe types \n"
    ICD.display(pd.DataFrame(dataframe.dtypes),columns=['Columns','Dtype'])       ## list data type of each column
    print ''
    print "dataframe describe \n"
    ICD.display(pd.DataFrame(dataframe.describe()))## stats -- mean, min, max, etc.. 
    print ''
    print '# unique values in series:\n'
    u_list = []
    for item in dataframe:                             ## count number of unique values per column
        u_list.append([item, dataframe[item].nunique()])
    unique_vals = pd.DataFrame(u_list,columns=['Column','Num_unique'])
    ICD.display(unique_vals)

eda(df)
dataframe shape:  (156772, 137)

num duplicates: 0

pct missing values 

0
eventid 0.000000
iyear 0.000000
imonth 0.000000
iday 0.000000
approxdate 96.966295
extended 0.000000
resolution 97.766183
country 0.000000
country_txt 0.000000
region 0.000000
region_txt 0.000000
provstate 9.261858
city 0.284490
latitude 2.882530
longitude 2.882530
specificity 0.000000
vicinity 0.000000
location 73.074911
summary 42.188656
crit1 0.000000
crit2 0.000000
crit3 0.000000
doubtterr 0.000638
alternative 84.540607
alternative_txt 0.000000
multiple 0.000000
success 0.000000
suicide 0.000000
attacktype1 0.000000
attacktype1_txt 0.000000
... ...
propextent 64.054806
propextent_txt 0.000000
propvalue 80.027046
propcomment 68.475238
ishostkid 0.113541
nhostkid 92.812492
nhostkidus 92.847575
nhours 97.893757
ndays 95.801546
divert 99.815656
kidhijcountry 97.901411
ransom 52.101141
ransomamt 99.237747
ransomamtus 99.737836
ransompaid 99.602608
ransompaidus 99.743577
ransomnote 99.731457
hostkidoutcome 94.460108
hostkidoutcome_txt 0.000000
nreleased 94.836450
addnotes 86.015360
scite1 42.309851
scite2 60.987294
scite3 78.228255
dbsource 0.000000
INT_LOG 0.000000
INT_IDEO 0.000000
INT_MISC 0.000000
INT_ANY 0.000000
related 86.973439

137 rows × 1 columns

dataframe types 

0
eventid int64
iyear int64
imonth int64
iday int64
approxdate object
extended int64
resolution object
country int64
country_txt object
region int64
region_txt object
provstate object
city object
latitude float64
longitude float64
specificity int64
vicinity int64
location object
summary object
crit1 int64
crit2 int64
crit3 int64
doubtterr float64
alternative float64
alternative_txt object
multiple int64
success int64
suicide int64
attacktype1 int64
attacktype1_txt object
... ...
propextent float64
propextent_txt object
propvalue float64
propcomment object
ishostkid float64
nhostkid float64
nhostkidus float64
nhours float64
ndays float64
divert object
kidhijcountry object
ransom float64
ransomamt float64
ransomamtus float64
ransompaid float64
ransompaidus float64
ransomnote object
hostkidoutcome float64
hostkidoutcome_txt object
nreleased float64
addnotes object
scite1 object
scite2 object
scite3 object
dbsource object
INT_LOG int64
INT_IDEO int64
INT_MISC int64
INT_ANY int64
related object

137 rows × 1 columns

dataframe describe 

eventid iyear imonth iday extended country region latitude longitude specificity ... ransomamt ransomamtus ransompaid ransompaidus hostkidoutcome nreleased INT_LOG INT_IDEO INT_MISC INT_ANY
count 1.567720e+05 156772.000000 156772.000000 156772.000000 156772.000000 156772.000000 156772.000000 152253.000000 152253.000000 156772.000000 ... 1.195000e+03 4.110000e+02 6.230000e+02 402.000000 8685.000000 8095.000000 156772.000000 156772.000000 156772.000000 156772.000000
mean 2.000541e+11 2000.474083 6.484666 15.455215 0.041347 133.087401 6.970097 23.190988 24.210467 1.452632 ... 3.320127e+06 5.454451e+05 4.319721e+05 305.196517 4.592170 -27.788635 -4.834645 -4.789114 0.093894 -4.221124
std 1.298283e+09 12.982397 3.392225 8.815533 0.199091 113.946290 2.967803 19.220723 59.900831 1.016971 ... 3.187694e+07 6.665967e+06 2.589893e+06 3409.027685 2.049184 58.524976 4.528862 4.589779 0.602442 4.686143
min 1.970000e+11 1970.000000 0.000000 0.000000 0.000000 4.000000 1.000000 -53.154613 -176.176447 1.000000 ... -9.900000e+01 -9.900000e+01 -9.900000e+01 -99.000000 1.000000 -99.000000 -9.000000 -9.000000 -9.000000 -9.000000
25% 1.989082e+11 1989.000000 4.000000 8.000000 0.000000 69.000000 5.000000 10.756961 -1.929857 1.000000 ... 0.000000e+00 0.000000e+00 -9.900000e+01 0.000000 2.000000 -99.000000 -9.000000 -9.000000 0.000000 -9.000000
50% 2.005071e+11 2005.000000 6.000000 15.000000 0.000000 101.000000 6.000000 31.285506 41.919647 1.000000 ... 1.250000e+04 0.000000e+00 0.000000e+00 0.000000 4.000000 0.000000 -9.000000 -9.000000 0.000000 0.000000
75% 2.013060e+11 2013.000000 9.000000 23.000000 0.000000 160.000000 10.000000 34.842222 68.416974 1.000000 ... 4.115000e+05 0.000000e+00 4.276840e+03 0.000000 7.000000 1.000000 0.000000 0.000000 0.000000 0.000000
max 2.015123e+11 2015.000000 12.000000 31.000000 1.000000 1004.000000 12.000000 74.633553 179.366667 5.000000 ... 1.000000e+09 1.320000e+08 4.100000e+07 48000.000000 7.000000 1201.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 79 columns

# unique values in series:

Column Num_unique
0 eventid 156772
1 iyear 45
2 imonth 13
3 iday 32
4 approxdate 1426
5 extended 2
6 resolution 2657
7 country 206
8 country_txt 206
9 region 12
10 region_txt 12
11 provstate 2509
12 city 31324
13 latitude 52021
14 longitude 51632
15 specificity 5
16 vicinity 3
17 location 35794
18 summary 34335
19 crit1 2
20 crit2 2
21 crit3 2
22 doubtterr 3
23 alternative 5
24 alternative_txt 6
25 multiple 2
26 success 2
27 suicide 2
28 attacktype1 9
29 attacktype1_txt 9
... ... ...
107 propextent 4
108 propextent_txt 5
109 propvalue 604
110 propcomment 17437
111 ishostkid 3
112 nhostkid 221
113 nhostkidus 28
114 nhours 34
115 ndays 289
116 divert 142
117 kidhijcountry 217
118 ransom 3
119 ransomamt 350
120 ransomamtus 21
121 ransompaid 122
122 ransompaidus 8
123 ransomnote 285
124 hostkidoutcome 7
125 hostkidoutcome_txt 8
126 nreleased 155
127 addnotes 9613
128 scite1 66698
129 scite2 50118
130 scite3 28489
131 dbsource 26
132 INT_LOG 3
133 INT_IDEO 3
134 INT_MISC 3
135 INT_ANY 3
136 related 18082

137 rows × 2 columns

Define graph settings

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 20 
plt.rcParams['ytick.labelsize'] = 20 

Exploring the Data

GOAL: Find something interesting to use in a Bayesian Inference

Types and Counts of Attack Types

In [4]:
df.attacktype1_txt.value_counts()
Out[4]:
Bombing/Explosion                      75963
Armed Assault                          37554
Assassination                          17582
Hostage Taking (Kidnapping)             9115
Facility/Infrastructure Attack          8849
Unknown                                 5490
Hostage Taking (Barricade Incident)      835
Unarmed Assault                          828
Hijacking                                556
Name: attacktype1_txt, dtype: int64

So Many Graphs

In [81]:
plt.hist(df.iyear,bins=46)
plt.title('Number of Attacks by Year',fontsize=30)
plt.show()
In [9]:
import seaborn as sns
plt.rcParams['figure.figsize'] = (10, 8)
# plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 10 
# plt.rcParams['ytick.labelsize'] = 20 
sns.countplot(x="attacktype1_txt", data=df)
plt.title('Number of Attacks by Type',size=30)
plt.xticks(rotation=70)
plt.show()
In [19]:
attacks_by_type = df[['iyear','attacktype1_txt','attacktype1']].groupby(['iyear','attacktype1_txt']).attacktype1.count().reset_index()
attacks_by_type.rename(columns={'attacktype1_txt':'Attack Type'},inplace=True)
attacks_by_type.set_index('iyear',inplace=True)

for atype in list(df.attacktype1_txt.unique()):
    plt.plot(attacks_by_type[['attacktype1']][attacks_by_type['Attack Type']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Type',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
In [80]:
attacks_by_type = df[['iyear','attacktype1_txt','attacktype1']][df.suicide==1].groupby(['iyear','attacktype1_txt']).attacktype1.count().reset_index()
attacks_by_type.rename(columns={'attacktype1_txt':'Attack Type'},inplace=True)
attacks_by_type.set_index('iyear',inplace=True)

for atype in list(df.attacktype1_txt.unique()):
    plt.plot(attacks_by_type[['attacktype1']][attacks_by_type['Attack Type']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.rcParams['figure.figsize'] = (10, 8)
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Type',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
In [21]:
df['weaptype1_txt'].replace({'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':'Vehicle'},inplace=True)

attacks_by_weapon = df[['iyear','weaptype1_txt','weaptype1']].groupby(['iyear','weaptype1_txt']).weaptype1.count().reset_index()
attacks_by_weapon.rename(columns={'weaptype1_txt':'Weapon'},inplace=True)
attacks_by_weapon.set_index('iyear',inplace=True)

attacks_by_weapon

for atype in list(df.weaptype1_txt.unique()):
    plt.plot(attacks_by_weapon[['weaptype1']][attacks_by_weapon['Weapon']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Weapon',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
In [22]:
df['weaptype1_txt'].replace({'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':'Vehicle'},inplace=True)

attacks_by_weapon = df[['iyear','weaptype1_txt','weaptype1']][df.suicide==1].groupby(['iyear','weaptype1_txt']).weaptype1.count().reset_index()
attacks_by_weapon.rename(columns={'weaptype1_txt':'Weapon'},inplace=True)
attacks_by_weapon.set_index('iyear',inplace=True)

attacks_by_weapon

for atype in list(df.weaptype1_txt.unique()):
    plt.plot(attacks_by_weapon[['weaptype1']][attacks_by_weapon['Weapon']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Weapon',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
In [15]:
import seaborn as sns
# plt.rcParams['figure.figsize'] = (16, 14)
plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 20 
plt.rcParams['ytick.labelsize'] = 20 
countries = df.groupby(['country_txt']).attacktype1.count().reset_index().sort_values('attacktype1',ascending=False)[0:10]
countries = list(countries.country_txt)
countries.append('United States')
mask = df[df.country_txt.isin(countries)]
sns.countplot(x="country_txt", data=mask)
plt.title('Number of Attacks by Country',size=30)
plt.xticks(rotation=70)
plt.show()
In [23]:
attacks_by_region = df[['iyear','region_txt','region']].groupby(['iyear','region_txt']).region.count().reset_index()
attacks_by_region.rename(columns={'region_txt':'Region'},inplace=True)
attacks_by_region.set_index('iyear',inplace=True)

for atype in list(df.region_txt.unique()):
    plt.plot(attacks_by_region[['region']][attacks_by_region['Region']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Region',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
In [24]:
attacks_by_region = df[['iyear','region_txt','region']][df.suicide==1].groupby(['iyear','region_txt']).region.count().reset_index()
attacks_by_region.rename(columns={'region_txt':'Region'},inplace=True)
attacks_by_region.set_index('iyear',inplace=True)

for atype in list(df.region_txt.unique()):
    plt.plot(attacks_by_region[['region']][attacks_by_region['Region']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Region',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
In [10]:
from matplotlib import pyplot as plt

# get total attacks per year
attacks_by_year = df[['iyear','eventid']].groupby(['iyear']).eventid.count().reset_index()
attacks_by_year.rename(columns={'eventid':'Total Attacks'},inplace=True)

# get number of suicide attacks per year
suicide_by_year = df[['iyear','suicide']].groupby(['iyear']).suicide.sum().reset_index()
# get number of successful attacks per year
successes_by_year = df[['iyear','success']].groupby(['iyear']).success.sum().reset_index()
# get number of suicide attacks per year
nperps_by_year = df[['iyear','nperps']][df.nperps!=-99].groupby(['iyear']).nperps.sum().reset_index()
# get number killed per year
killed_by_year = df[['iyear','nkill']].groupby(['iyear']).nkill.sum().reset_index()
# get number wounded per year
wounded_by_year = df[['iyear','nwound']].groupby(['iyear']).nwound.sum().reset_index()
# get number wounded per year
propval_by_year = df[['iyear','propvalue']].groupby(['iyear']).propvalue.sum().reset_index()

# combine into one df
attacks_by_year = attacks_by_year.merge(suicide_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(successes_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(nperps_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(killed_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(wounded_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(propval_by_year,left_on='iyear',right_on='iyear')

attacks_by_year.rename(columns={'eventid':'Total Attacks',
                                'suicide':'Suicide Attacks',
                                'success':'Successful Attacks',
#                                 'nperps':'Number of Perpetrators',
                                'nkill':'Number Killed',
                                'nwound':'Number Wounded'},inplace=True)
attacks_by_year.set_index('iyear',inplace=True)

plt.plot(attacks_by_year['Total Attacks'], label="Total Attacks",linewidth=3.0)
plt.plot(attacks_by_year['Suicide Attacks'], label="Suicide Attacks",linewidth=3.0)
plt.plot(attacks_by_year['Successful Attacks'], label="Successful Attacks",linewidth=3.0)
# plt.plot(attacks_by_year['Number of Perpetrators'], label="Number of Perps")
plt.plot(attacks_by_year['Number Killed'], label="Number Wounded",linewidth=3.0)
plt.plot(attacks_by_year['Number Wounded'], label="Number Killed",linewidth=3.0)
# plt.plot(attacks_by_year['Total Property Value'], label="Total Property Value")

plt.xlabel('Year',fontsize=30)
plt.ylabel('Number of Attacks')
plt.title('Total Terror Attacks',fontsize=30)
plt.legend(prop={'size':20})
plt.show()

AWESOME

Up until now, most of the line graphs have looked pretty similar, but check out the green Suicide Attacks line. It looks nearly constant. However, towards the more recent years, there's a slight trend. I want to investigate this further. Now I don't regret writing up all of this code in the last block.

II. Bayesian Inference

After looking at countless graphs, I've decided to look more closely into suicide attacks for my Bayesian estimation.

I'm going to see if there's a statistically significant difference between suicide attacks pre 9/11 and post 9/11.

I will do this by comparing number of suicide attacks per year for 10 years before 9/11 and 10 years after 9/11.

In [36]:
# compare distributions

plt.hist(df.iyear[(df.iyear>=1990)&(df.iyear<2001)],bins=46)
plt.title('Number of Attacks by Year--Before 9/11',fontsize=30)
plt.show()
plt.hist(df.iyear[(df.iyear>2001)&(df.iyear<=2011)],bins=46)
plt.title('Number of Attacks by Year--After 9/11',fontsize=30)
Out[36]:
<matplotlib.text.Text at 0x123655fd0>
In [44]:
print 'Avg # suicide attacks before 9/11 =', round(df.iyear[(df.iyear>=1990)&(df.iyear<2001)].value_counts().mean(),2),'; total =', df.iyear[(df.iyear>=1990)&(df.iyear<2001)].value_counts().sum()
print 'Avg # suicide attacks after 9/11 =', round(df.iyear[(df.iyear>2001)&(df.iyear<=2011)].value_counts().mean(),2), '; total =', df.iyear[(df.iyear>2001)&(df.iyear<=2011)].value_counts().sum()
Avg # suicide attacks before 9/11 = 3058.0 ; total = 30580
Avg # suicide attacks after 9/11 = 3115.4 ; total = 31154

So they're both pretty high averages, but I can't say with confidence whether or not the difference is statistically significant. Let's try to figure that out.

In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Couldn't import dot_parser, loading of dot files will not be possible.

Determine the Prior

QUESTION: By comparing terror attacks before and after 9/11, can we say that the 9/11 attack resulted in a statistically significant difference in number of suicide attacks?

As this was one of the deadliest attacks in hugely populated areas of the US, and was broadly publicized, I would assume that the number of suicide attacks would increase sadly due to popularization of radical extremist groups.

In [22]:
# df with only the suicide attacks
base = df[df.suicide==1]

# df for prior
df_prior = df[df.iyear<1990].iyear.value_counts()

# df for both of the time periods I'm comparing
df_before = df[(df.iyear<2001)&(df.iyear>=1990)].iyear.value_counts()
df_after = df[(df.iyear>2001)&(df.iyear<=2011)].iyear.value_counts()

Set up Prior Distribution

In [43]:
# set up bayesian model and priors on means
mean_prior_mean = df_prior.mean()
mean_prior_std  = df_prior.std()

print 'mean =', round(mean_prior_mean,2)
print 'std =', round(mean_prior_std,2)
mean = 2049.8
std = 1256.56
In [24]:
std_prior_lower = 0.01
std_prior_upper = 100.0

# set the prior distribution to both time periods before and after 9/11 
# prior belief that the number of suicide attacks is the same
with pm.Model() as model:

    before_mean = pm.Normal('before_mean', mean_prior_mean, sd=mean_prior_std)
    after_mean = pm.Normal('after_mean', mean_prior_mean, sd=mean_prior_std)

    before_std = pm.Uniform('before_std', lower=std_prior_lower, upper=std_prior_upper)
    after_std = pm.Uniform('after_std', lower=std_prior_lower, upper=std_prior_upper)

Set up Posterior Distribution

In [25]:
# using the prior distribution, we will set up the posterior distribution
# so using before/after_means, stds, and then adding the observed suicide attacks 
# in both the time periods before and after 9/11

with model:

    posterior_before = pm.Normal('Before_9_11', mu=before_mean, sd=before_std, observed=df_before)
    posterior_after = pm.Normal('After_9_11)', mu=after_mean, sd=after_std, observed=df_after)

Gather Stats on Posterior Distributions

In [26]:
with model:

    diff_of_means = pm.Deterministic('difference of means', after_mean - before_mean)
    diff_of_stds = pm.Deterministic('difference of stds', after_std - before_std)
    effect_size = pm.Deterministic('effect size',
                                   diff_of_means / np.sqrt((after_std**2 + before_std**2) / 2))
In [27]:
with model:
    trace = pm.sample(25000, njobs=4)
Auto-assigning NUTS sampler...
Initializing NUTS using advi...
Average ELBO = -3,116.6: 100%|██████████| 200000/200000 [00:29<00:00, 6700.30it/s]
Finished [100%]: Average ELBO = -3,104.3
100%|██████████| 25000/25000 [02:25<00:00, 198.27it/s]
In [28]:
pm.traceplot(trace);
# these colors are horrendous 
In [29]:
pm.plot_posterior(trace[3000:],
                  varnames=['after_mean', 'before_mean', 'before_std', 'after_std'],
                  color='#87ceeb');
In [30]:
pm.plot_posterior(trace[3000:],
                  varnames=['difference of means', 'difference of stds', 'effect size'],
                  ref_val=0,
                  color='#87ceeb');
In [31]:
pm.summary(trace[3000:],
           varnames=['difference of means', 'difference of stds', 'effect size'])
difference of means:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  57.335           44.451           0.152            [-29.818, 143.810]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -29.719        27.264         57.399         87.175         144.000


difference of stds:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.019            0.073            0.000            [-0.123, 0.181]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -0.116         -0.020         0.010          0.052          0.190


effect size:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.574            0.445            0.002            [-0.298, 1.439]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -0.297         0.273          0.574          0.872          1.441

Bayesian Inference: Interpretation

It appears as though the difference in suicide attacks/year before and after 9/11 is not statistically significant. I guess it's a positive thing that 9/11 didn't lead to a huge increase in suicide attacks, but it says nothing about other types of attacks.

III. Impute 1993 Values

The codebook for the dataset describes how the poor handling of the data over time has led to the loss of some of the 1993 data. It is therefore excluded from the dataset as to not provide a false representation of the actual distribution.

The instruction was to impute the number of 1993 bombings, but I'll also do it for the other attack types.

Set all dates to Jan 1st of respective year -- I found that this was the only way I could get the ARMA model to work

In [3]:
# Compile list of new dates, setting new date to the first day of the year
dates = []
for i in range(0,len(df)):
    dates.append(str(df.iyear.loc[i])+'-01-01')

    
# create a clean date for the actual/real date
dates_real = []
for i in range(0,len(df)):
    month = df.imonth.loc[i]
    day = df.iday.loc[i]
    if month==0:
        month = 1
    if day==0:
        day = 1
    dates_real.append(str(df.iyear.loc[i])+'-'+str(month)+'-'+str(day))


# Add Date series to df
df['Date'] = pd.Series(dates)
df.Date = pd.to_datetime(df.Date)
df['Date_real'] = pd.Series(dates_real)
df.Date_real = pd.to_datetime(df.Date_real)

Create sub-df, grouping by year to get annual counts of terror attacks

In [4]:
impute = df[['Date','attacktype1','attacktype1_txt']][(df.attacktype1==3)&(df.iyear<=1992)].groupby(['Date','attacktype1']).attacktype1_txt.count().reset_index()
impute.rename(columns={'attacktype1_txt':'num_attacks'},inplace=True)
impute.drop(['attacktype1'],axis=1,inplace=True)

#converting 'date' column to a datetime type
impute['Date'] = pd.to_datetime(impute['Date'])
# resetting date back to the index
impute.set_index('Date',inplace = True)

Look at Data for Stationarity

In [52]:
import matplotlib.pyplot as plt
temp = pd.DataFrame(df[['Date_real','eventid']].groupby('Date_real').eventid.count())
temp.index = pd.to_datetime(temp.index)

plt.plot(figsize=(10,8))
plt.plot(temp)
plt.show()

I'm going to make a judgment call that the data is close enough to stationary that it doesn't need de-trending. It appears as though after the mean peaks around 2013-2014, it begins to fall a bit. Also, due to the sparsity of data in the earlier years, the lines are shorter than maybe they would be had all terror attacks been recorded. Conversely, it's definitely possible that before the rise of social media, less attacks were actually known. This could potentially balance out this line instead of looking like it's trending.

Determine p, q

In [53]:
from pandas.tools.plotting import autocorrelation_plot

autocorrelation_plot(temp)
plt.show()
In [55]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Look Back 20 months to analyze Autocorr and Partial Autocorr

plot_pacf(impute, lags = 22)
plt.show()

plot_acf(impute, lags = 22)
plt.show()

My data is grouped by year, so lags are in years. Aside from 0 in the partial autocorrelation plot, the line from 1 has the highest partial autocorr, and is the only line outside of the 95% confidence interval. This means that we will set our p=1. For the ACF plot, there is no sharp cutoff, so we will try out q=0.

Confirm (p,q)=(1,0) by testing out all p, q combinations to find the best fit, indicated by the lowest AIC score

In [9]:
from __future__ import division
from statsmodels.tsa.arima_model import ARMA
# Find optimal p, q:
final_AIC = 500
p,q=0,0

for i in range(0,5):
    for j in range(0,5):
        try:
            AIC = ARMA(impute,order=(i,j)).fit().aic
            print i,j,AIC
            if AIC < final_AIC:
                final_AIC = AIC
                p, q =i,j
        except:
            pass
final_AIC, p, q


# Really glad I saved the results from the first time it ran, because I re-ran this and it doesn't do anything... 

# 0 0 363.8556849
# 0 1 342.951384017
# 0 3 328.177608525
# 1 0 315.857908067
# 1 1 317.517697807
# 1 2 318.145000956
# 1 3 nan
# 1 4 331.215672293
# 2 0 317.307563434
# 2 1 316.698746446
# 2 3 nan
# 2 4 nan
# 3 0 317.73817456
# 3 2 334.171843641
# 3 3 nan
# 3 4 nan
# 4 0 319.320923525
# 4 2 nan
# 4 4 nan

# (315.85790806726385, 1, 0)

# Anyways this right here ^^ is the result

This test has confirmed my choice for p=1, q=0.

Create ARMA model, using p=1 (which is just AR(1)) to imply that the prior year has an impact on the current

In [7]:
from matplotlib import pyplot as plt

# fit model
impute.num_attacks = impute.num_attacks.astype(float)
model = ARMA(impute, order=(1,0))
model_fit = model.fit()
print(model_fit.summary())

# plot residual errors
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
plt.plot(impute,color='blue',label='true')
plt.plot(model_fit.fittedvalues, color='red',label='predicted')
plt.legend()
plt.show()
print(residuals.describe())
                              ARMA Model Results                              
==============================================================================
Dep. Variable:            num_attacks   No. Observations:                   23
Model:                     ARMA(1, 0)   Log Likelihood                -154.929
Method:                       css-mle   S.D. of innovations            193.289
Date:                Wed, 10 May 2017   AIC                            315.858
Time:                        21:14:14   BIC                            319.264
Sample:                    01-01-1970   HQIC                           316.715
                         - 01-01-1992                                         
=====================================================================================
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1036.9474    532.176      1.949      0.065      -6.099    2079.994
ar.L1.num_attacks     0.9552      0.048     19.963      0.000       0.861       1.049
                                    Roots                                    
=============================================================================
                 Real           Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            1.0469           +0.0000j            1.0469            0.0000
-----------------------------------------------------------------------------
                0
count   23.000000
mean    29.227172
std    242.313101
min   -703.947413
25%    -67.130301
50%     43.957914
75%    149.720161
max    538.374743

Predict 1993 number of terror attacks

In [33]:
from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
# get what you need for predicting one-step ahead
params = model_fit.params
residuals = model_fit.resid
p = model_fit.k_ar
q = model_fit.k_ma
k_exog = model_fit.k_exog
k_trend = model_fit.k_trend
steps = 1

_arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=impute, exog=None, start=len(impute))
Out[33]:
array([ 1706.56203623])

Here's a function to predict the number of attacks for the other attack types.

In [14]:
from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
from statsmodels.tsa.arima_model import ARMA
from matplotlib import pyplot as plt

def impute_attacks(attack_type,step=1):

    impute = df[['Date','attacktype1']][(df.attacktype1==attack_type)&(df.iyear<=1992)].groupby('Date').attacktype1.count().reset_index()
    impute.rename(columns={'attacktype1':'num_attacks'},inplace=True)
    #converting 'date' column to a datetime type
    impute['Date'] = pd.to_datetime(impute['Date'])
    # resetting date back to the index
    impute.set_index('Date',inplace = True)
    impute.num_attacks = impute.num_attacks.astype(float)
    model = ARMA(impute, order=(1,0))
    model_fit = model.fit()

    params = model_fit.params
    residuals = model_fit.resid
    p = model_fit.k_ar
    q = model_fit.k_ma
    k_exog = model_fit.k_exog
    k_trend = model_fit.k_trend
    steps = step
    
    attack_txt = df.attacktype1_txt[df.attacktype1==attack_type].unique()[0]
    return attack_txt, _arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=impute, exog=None, start=len(impute))[0]
    

Results:

In [35]:
# loop through each attack type and predict the number of attacks
predicted = []
for attack_type in (df.attacktype1.unique()):
    predicted.append(impute_attacks(attack_type))

# turn into dataframe and spit out total number of 1993 terror attacks
predicted = pd.DataFrame(predicted,columns=['Attack Type','1993 Predicted Number of Attacks'])
print 'Total # of Terror Attacks in 1993: ',predicted.sum()[1]; predicted
Total # of Terror Attacks in 1993:  4886.30141491
//anaconda/lib/python2.7/site-packages/statsmodels/tsa/kalmanf/kalmanfilter.py:649: RuntimeWarning: divide by zero encountered in divide
  R_mat, T_mat)
Out[35]:
Attack Type 1993 Predicted Number of Attacks
0 Assassination 1067.181690
1 Hostage Taking (Kidnapping) 133.048975
2 Bombing/Explosion 1706.553880
3 Facility/Infrastructure Attack 452.218146
4 Armed Assault 1301.690876
5 Hijacking 17.084018
6 Unknown 143.243050
7 Unarmed Assault 43.069847
8 Hostage Taking (Barricade Incident) 22.210933