Predicting Terrorism¶

import pandas as pd
import glob
import pandas as pd
import numpy as np

db = '/Users/jennydoyle/Desktop/dsi/04-project-assets/globalterrorismdb_0616dist.csv'
df = pd.read_csv(db, header=0,low_memory=False)
print 'Size = ', df.shape

Size =  (156772, 137)

I. EDA¶

def eda(dataframe):
    from IPython.core import display as ICD
    print "dataframe shape: ",dataframe.shape      ## rows by columns
    print ''
    print 'num duplicates:', dataframe.duplicated().sum() ## df.drop_duplicates() to remove dupes
    print ''
    print "pct missing values \n"
    ICD.display(pd.DataFrame(dataframe.isnull().sum()/len(df)*100),columns=['Columns','Pct Missing']), ## count number of null values per column
    print ''
    print "dataframe types \n"
    ICD.display(pd.DataFrame(dataframe.dtypes),columns=['Columns','Dtype'])       ## list data type of each column
    print ''
    print "dataframe describe \n"
    ICD.display(pd.DataFrame(dataframe.describe()))## stats -- mean, min, max, etc.. 
    print ''
    print '# unique values in series:\n'
    u_list = []
    for item in dataframe:                             ## count number of unique values per column
        u_list.append([item, dataframe[item].nunique()])
    unique_vals = pd.DataFrame(u_list,columns=['Column','Num_unique'])
    ICD.display(unique_vals)

eda(df)

dataframe shape:  (156772, 137)

num duplicates: 0

pct missing values

dataframe types

dataframe describe

# unique values in series:

Define graph settings¶

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 20 
plt.rcParams['ytick.labelsize'] = 20

Exploring the Data¶

GOAL: Find something interesting to use in a Bayesian Inference

Types and Counts of Attack Types¶

df.attacktype1_txt.value_counts()

Bombing/Explosion                      75963
Armed Assault                          37554
Assassination                          17582
Hostage Taking (Kidnapping)             9115
Facility/Infrastructure Attack          8849
Unknown                                 5490
Hostage Taking (Barricade Incident)      835
Unarmed Assault                          828
Hijacking                                556
Name: attacktype1_txt, dtype: int64

So Many Graphs¶

plt.hist(df.iyear,bins=46)
plt.title('Number of Attacks by Year',fontsize=30)
plt.show()

import seaborn as sns
plt.rcParams['figure.figsize'] = (10, 8)
# plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 10 
# plt.rcParams['ytick.labelsize'] = 20 
sns.countplot(x="attacktype1_txt", data=df)
plt.title('Number of Attacks by Type',size=30)
plt.xticks(rotation=70)
plt.show()

attacks_by_type = df[['iyear','attacktype1_txt','attacktype1']].groupby(['iyear','attacktype1_txt']).attacktype1.count().reset_index()
attacks_by_type.rename(columns={'attacktype1_txt':'Attack Type'},inplace=True)
attacks_by_type.set_index('iyear',inplace=True)

for atype in list(df.attacktype1_txt.unique()):
    plt.plot(attacks_by_type[['attacktype1']][attacks_by_type['Attack Type']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Type',fontsize=30)
plt.legend(prop={'size':15})
plt.show()

attacks_by_type = df[['iyear','attacktype1_txt','attacktype1']][df.suicide==1].groupby(['iyear','attacktype1_txt']).attacktype1.count().reset_index()
attacks_by_type.rename(columns={'attacktype1_txt':'Attack Type'},inplace=True)
attacks_by_type.set_index('iyear',inplace=True)

for atype in list(df.attacktype1_txt.unique()):
    plt.plot(attacks_by_type[['attacktype1']][attacks_by_type['Attack Type']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.rcParams['figure.figsize'] = (10, 8)
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Type',fontsize=30)
plt.legend(prop={'size':15})
plt.show()

df['weaptype1_txt'].replace({'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':'Vehicle'},inplace=True)

attacks_by_weapon = df[['iyear','weaptype1_txt','weaptype1']].groupby(['iyear','weaptype1_txt']).weaptype1.count().reset_index()
attacks_by_weapon.rename(columns={'weaptype1_txt':'Weapon'},inplace=True)
attacks_by_weapon.set_index('iyear',inplace=True)

attacks_by_weapon

for atype in list(df.weaptype1_txt.unique()):
    plt.plot(attacks_by_weapon[['weaptype1']][attacks_by_weapon['Weapon']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Weapon',fontsize=30)
plt.legend(prop={'size':15})
plt.show()

df['weaptype1_txt'].replace({'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':'Vehicle'},inplace=True)

attacks_by_weapon = df[['iyear','weaptype1_txt','weaptype1']][df.suicide==1].groupby(['iyear','weaptype1_txt']).weaptype1.count().reset_index()
attacks_by_weapon.rename(columns={'weaptype1_txt':'Weapon'},inplace=True)
attacks_by_weapon.set_index('iyear',inplace=True)

attacks_by_weapon

for atype in list(df.weaptype1_txt.unique()):
    plt.plot(attacks_by_weapon[['weaptype1']][attacks_by_weapon['Weapon']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Weapon',fontsize=30)
plt.legend(prop={'size':15})
plt.show()

import seaborn as sns
# plt.rcParams['figure.figsize'] = (16, 14)
plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 20 
plt.rcParams['ytick.labelsize'] = 20 
countries = df.groupby(['country_txt']).attacktype1.count().reset_index().sort_values('attacktype1',ascending=False)[0:10]
countries = list(countries.country_txt)
countries.append('United States')
mask = df[df.country_txt.isin(countries)]
sns.countplot(x="country_txt", data=mask)
plt.title('Number of Attacks by Country',size=30)
plt.xticks(rotation=70)
plt.show()

attacks_by_region = df[['iyear','region_txt','region']].groupby(['iyear','region_txt']).region.count().reset_index()
attacks_by_region.rename(columns={'region_txt':'Region'},inplace=True)
attacks_by_region.set_index('iyear',inplace=True)

for atype in list(df.region_txt.unique()):
    plt.plot(attacks_by_region[['region']][attacks_by_region['Region']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Region',fontsize=30)
plt.legend(prop={'size':15})
plt.show()

attacks_by_region = df[['iyear','region_txt','region']][df.suicide==1].groupby(['iyear','region_txt']).region.count().reset_index()
attacks_by_region.rename(columns={'region_txt':'Region'},inplace=True)
attacks_by_region.set_index('iyear',inplace=True)

for atype in list(df.region_txt.unique()):
    plt.plot(attacks_by_region[['region']][attacks_by_region['Region']==atype],label=atype,linewidth=3.0)
    # plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Region',fontsize=30)
plt.legend(prop={'size':15})
plt.show()

from matplotlib import pyplot as plt

# get total attacks per year
attacks_by_year = df[['iyear','eventid']].groupby(['iyear']).eventid.count().reset_index()
attacks_by_year.rename(columns={'eventid':'Total Attacks'},inplace=True)

# get number of suicide attacks per year
suicide_by_year = df[['iyear','suicide']].groupby(['iyear']).suicide.sum().reset_index()
# get number of successful attacks per year
successes_by_year = df[['iyear','success']].groupby(['iyear']).success.sum().reset_index()
# get number of suicide attacks per year
nperps_by_year = df[['iyear','nperps']][df.nperps!=-99].groupby(['iyear']).nperps.sum().reset_index()
# get number killed per year
killed_by_year = df[['iyear','nkill']].groupby(['iyear']).nkill.sum().reset_index()
# get number wounded per year
wounded_by_year = df[['iyear','nwound']].groupby(['iyear']).nwound.sum().reset_index()
# get number wounded per year
propval_by_year = df[['iyear','propvalue']].groupby(['iyear']).propvalue.sum().reset_index()

# combine into one df
attacks_by_year = attacks_by_year.merge(suicide_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(successes_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(nperps_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(killed_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(wounded_by_year,left_on='iyear',right_on='iyear')
attacks_by_year = attacks_by_year.merge(propval_by_year,left_on='iyear',right_on='iyear')

attacks_by_year.rename(columns={'eventid':'Total Attacks',
                                'suicide':'Suicide Attacks',
                                'success':'Successful Attacks',
#                                 'nperps':'Number of Perpetrators',
                                'nkill':'Number Killed',
                                'nwound':'Number Wounded'},inplace=True)
attacks_by_year.set_index('iyear',inplace=True)

plt.plot(attacks_by_year['Total Attacks'], label="Total Attacks",linewidth=3.0)
plt.plot(attacks_by_year['Suicide Attacks'], label="Suicide Attacks",linewidth=3.0)
plt.plot(attacks_by_year['Successful Attacks'], label="Successful Attacks",linewidth=3.0)
# plt.plot(attacks_by_year['Number of Perpetrators'], label="Number of Perps")
plt.plot(attacks_by_year['Number Killed'], label="Number Wounded",linewidth=3.0)
plt.plot(attacks_by_year['Number Wounded'], label="Number Killed",linewidth=3.0)
# plt.plot(attacks_by_year['Total Property Value'], label="Total Property Value")

plt.xlabel('Year',fontsize=30)
plt.ylabel('Number of Attacks')
plt.title('Total Terror Attacks',fontsize=30)
plt.legend(prop={'size':20})
plt.show()

AWESOME¶

Up until now, most of the line graphs have looked pretty similar, but check out the green Suicide Attacks line. It looks nearly constant. However, towards the more recent years, there's a slight trend. I want to investigate this further. Now I don't regret writing up all of this code in the last block.

II. Bayesian Inference¶

After looking at countless graphs, I've decided to look more closely into suicide attacks for my Bayesian estimation.¶

I'm going to see if there's a statistically significant difference between suicide attacks pre 9/11 and post 9/11.

I will do this by comparing number of suicide attacks per year for 10 years before 9/11 and 10 years after 9/11.

# compare distributions

plt.hist(df.iyear[(df.iyear>=1990)&(df.iyear<2001)],bins=46)
plt.title('Number of Attacks by Year--Before 9/11',fontsize=30)
plt.show()
plt.hist(df.iyear[(df.iyear>2001)&(df.iyear<=2011)],bins=46)
plt.title('Number of Attacks by Year--After 9/11',fontsize=30)

<matplotlib.text.Text at 0x123655fd0>

print 'Avg # suicide attacks before 9/11 =', round(df.iyear[(df.iyear>=1990)&(df.iyear<2001)].value_counts().mean(),2),'; total =', df.iyear[(df.iyear>=1990)&(df.iyear<2001)].value_counts().sum()
print 'Avg # suicide attacks after 9/11 =', round(df.iyear[(df.iyear>2001)&(df.iyear<=2011)].value_counts().mean(),2), '; total =', df.iyear[(df.iyear>2001)&(df.iyear<=2011)].value_counts().sum()

Avg # suicide attacks before 9/11 = 3058.0 ; total = 30580
Avg # suicide attacks after 9/11 = 3115.4 ; total = 31154

So they're both pretty high averages, but I can't say with confidence whether or not the difference is statistically significant. Let's try to figure that out.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Couldn't import dot_parser, loading of dot files will not be possible.

Determine the Prior¶

QUESTION: By comparing terror attacks before and after 9/11, can we say that the 9/11 attack resulted in a statistically significant difference in number of suicide attacks?

As this was one of the deadliest attacks in hugely populated areas of the US, and was broadly publicized, I would assume that the number of suicide attacks would increase sadly due to popularization of radical extremist groups.

# df with only the suicide attacks
base = df[df.suicide==1]

# df for prior
df_prior = df[df.iyear<1990].iyear.value_counts()

# df for both of the time periods I'm comparing
df_before = df[(df.iyear<2001)&(df.iyear>=1990)].iyear.value_counts()
df_after = df[(df.iyear>2001)&(df.iyear<=2011)].iyear.value_counts()

Set up Prior Distribution¶

# set up bayesian model and priors on means
mean_prior_mean = df_prior.mean()
mean_prior_std  = df_prior.std()

print 'mean =', round(mean_prior_mean,2)
print 'std =', round(mean_prior_std,2)

mean = 2049.8
std = 1256.56

std_prior_lower = 0.01
std_prior_upper = 100.0

# set the prior distribution to both time periods before and after 9/11 
# prior belief that the number of suicide attacks is the same
with pm.Model() as model:

    before_mean = pm.Normal('before_mean', mean_prior_mean, sd=mean_prior_std)
    after_mean = pm.Normal('after_mean', mean_prior_mean, sd=mean_prior_std)

    before_std = pm.Uniform('before_std', lower=std_prior_lower, upper=std_prior_upper)
    after_std = pm.Uniform('after_std', lower=std_prior_lower, upper=std_prior_upper)

Set up Posterior Distribution¶

# using the prior distribution, we will set up the posterior distribution
# so using before/after_means, stds, and then adding the observed suicide attacks 
# in both the time periods before and after 9/11

with model:

    posterior_before = pm.Normal('Before_9_11', mu=before_mean, sd=before_std, observed=df_before)
    posterior_after = pm.Normal('After_9_11)', mu=after_mean, sd=after_std, observed=df_after)

Gather Stats on Posterior Distributions¶

with model:

    diff_of_means = pm.Deterministic('difference of means', after_mean - before_mean)
    diff_of_stds = pm.Deterministic('difference of stds', after_std - before_std)
    effect_size = pm.Deterministic('effect size',
                                   diff_of_means / np.sqrt((after_std**2 + before_std**2) / 2))

with model:
    trace = pm.sample(25000, njobs=4)

Auto-assigning NUTS sampler...
Initializing NUTS using advi...
Average ELBO = -3,116.6: 100%|██████████| 200000/200000 [00:29<00:00, 6700.30it/s]
Finished [100%]: Average ELBO = -3,104.3
100%|██████████| 25000/25000 [02:25<00:00, 198.27it/s]

pm.traceplot(trace);
# these colors are horrendous

pm.plot_posterior(trace[3000:],
                  varnames=['after_mean', 'before_mean', 'before_std', 'after_std'],
                  color='#87ceeb');

pm.plot_posterior(trace[3000:],
                  varnames=['difference of means', 'difference of stds', 'effect size'],
                  ref_val=0,
                  color='#87ceeb');

pm.summary(trace[3000:],
           varnames=['difference of means', 'difference of stds', 'effect size'])

difference of means:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  57.335           44.451           0.152            [-29.818, 143.810]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -29.719        27.264         57.399         87.175         144.000


difference of stds:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.019            0.073            0.000            [-0.123, 0.181]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -0.116         -0.020         0.010          0.052          0.190


effect size:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.574            0.445            0.002            [-0.298, 1.439]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -0.297         0.273          0.574          0.872          1.441

Bayesian Inference: Interpretation¶

It appears as though the difference in suicide attacks/year before and after 9/11 is not statistically significant. I guess it's a positive thing that 9/11 didn't lead to a huge increase in suicide attacks, but it says nothing about other types of attacks.

III. Impute 1993 Values¶

The codebook for the dataset describes how the poor handling of the data over time has led to the loss of some of the 1993 data. It is therefore excluded from the dataset as to not provide a false representation of the actual distribution.

The instruction was to impute the number of 1993 bombings, but I'll also do it for the other attack types.

Set all dates to Jan 1st of respective year -- I found that this was the only way I could get the ARMA model to work¶

# Compile list of new dates, setting new date to the first day of the year
dates = []
for i in range(0,len(df)):
    dates.append(str(df.iyear.loc[i])+'-01-01')

    
# create a clean date for the actual/real date
dates_real = []
for i in range(0,len(df)):
    month = df.imonth.loc[i]
    day = df.iday.loc[i]
    if month==0:
        month = 1
    if day==0:
        day = 1
    dates_real.append(str(df.iyear.loc[i])+'-'+str(month)+'-'+str(day))


# Add Date series to df
df['Date'] = pd.Series(dates)
df.Date = pd.to_datetime(df.Date)
df['Date_real'] = pd.Series(dates_real)
df.Date_real = pd.to_datetime(df.Date_real)

Create sub-df, grouping by year to get annual counts of terror attacks¶

impute = df[['Date','attacktype1','attacktype1_txt']][(df.attacktype1==3)&(df.iyear<=1992)].groupby(['Date','attacktype1']).attacktype1_txt.count().reset_index()
impute.rename(columns={'attacktype1_txt':'num_attacks'},inplace=True)
impute.drop(['attacktype1'],axis=1,inplace=True)

#converting 'date' column to a datetime type
impute['Date'] = pd.to_datetime(impute['Date'])
# resetting date back to the index
impute.set_index('Date',inplace = True)

Look at Data for Stationarity¶

import matplotlib.pyplot as plt
temp = pd.DataFrame(df[['Date_real','eventid']].groupby('Date_real').eventid.count())
temp.index = pd.to_datetime(temp.index)

plt.plot(figsize=(10,8))
plt.plot(temp)
plt.show()

I'm going to make a judgment call that the data is close enough to stationary that it doesn't need de-trending. It appears as though after the mean peaks around 2013-2014, it begins to fall a bit. Also, due to the sparsity of data in the earlier years, the lines are shorter than maybe they would be had all terror attacks been recorded. Conversely, it's definitely possible that before the rise of social media, less attacks were actually known. This could potentially balance out this line instead of looking like it's trending.

Determine p, q¶

from pandas.tools.plotting import autocorrelation_plot

autocorrelation_plot(temp)
plt.show()

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Look Back 20 months to analyze Autocorr and Partial Autocorr

plot_pacf(impute, lags = 22)
plt.show()

plot_acf(impute, lags = 22)
plt.show()

My data is grouped by year, so lags are in years. Aside from 0 in the partial autocorrelation plot, the line from 1 has the highest partial autocorr, and is the only line outside of the 95% confidence interval. This means that we will set our p=1. For the ACF plot, there is no sharp cutoff, so we will try out q=0.

Confirm (p,q)=(1,0) by testing out all p, q combinations to find the best fit, indicated by the lowest AIC score¶

from __future__ import division
from statsmodels.tsa.arima_model import ARMA
# Find optimal p, q:
final_AIC = 500
p,q=0,0

for i in range(0,5):
    for j in range(0,5):
        try:
            AIC = ARMA(impute,order=(i,j)).fit().aic
            print i,j,AIC
            if AIC < final_AIC:
                final_AIC = AIC
                p, q =i,j
        except:
            pass
final_AIC, p, q


# Really glad I saved the results from the first time it ran, because I re-ran this and it doesn't do anything... 

# 0 0 363.8556849
# 0 1 342.951384017
# 0 3 328.177608525
# 1 0 315.857908067
# 1 1 317.517697807
# 1 2 318.145000956
# 1 3 nan
# 1 4 331.215672293
# 2 0 317.307563434
# 2 1 316.698746446
# 2 3 nan
# 2 4 nan
# 3 0 317.73817456
# 3 2 334.171843641
# 3 3 nan
# 3 4 nan
# 4 0 319.320923525
# 4 2 nan
# 4 4 nan

# (315.85790806726385, 1, 0)

# Anyways this right here ^^ is the result

This test has confirmed my choice for p=1, q=0.

Create ARMA model, using p=1 (which is just AR(1)) to imply that the prior year has an impact on the current¶

from matplotlib import pyplot as plt

# fit model
impute.num_attacks = impute.num_attacks.astype(float)
model = ARMA(impute, order=(1,0))
model_fit = model.fit()
print(model_fit.summary())

# plot residual errors
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
plt.plot(impute,color='blue',label='true')
plt.plot(model_fit.fittedvalues, color='red',label='predicted')
plt.legend()
plt.show()
print(residuals.describe())

                              ARMA Model Results                              
==============================================================================
Dep. Variable:            num_attacks   No. Observations:                   23
Model:                     ARMA(1, 0)   Log Likelihood                -154.929
Method:                       css-mle   S.D. of innovations            193.289
Date:                Wed, 10 May 2017   AIC                            315.858
Time:                        21:14:14   BIC                            319.264
Sample:                    01-01-1970   HQIC                           316.715
                         - 01-01-1992                                         
=====================================================================================
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1036.9474    532.176      1.949      0.065      -6.099    2079.994
ar.L1.num_attacks     0.9552      0.048     19.963      0.000       0.861       1.049
                                    Roots                                    
=============================================================================
                 Real           Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            1.0469           +0.0000j            1.0469            0.0000
-----------------------------------------------------------------------------

                0
count   23.000000
mean    29.227172
std    242.313101
min   -703.947413
25%    -67.130301
50%     43.957914
75%    149.720161
max    538.374743

Predict 1993 number of terror attacks¶

from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
# get what you need for predicting one-step ahead
params = model_fit.params
residuals = model_fit.resid
p = model_fit.k_ar
q = model_fit.k_ma
k_exog = model_fit.k_exog
k_trend = model_fit.k_trend
steps = 1

_arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=impute, exog=None, start=len(impute))

array([ 1706.56203623])

Here's a function to predict the number of attacks for the other attack types.

from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
from statsmodels.tsa.arima_model import ARMA
from matplotlib import pyplot as plt

def impute_attacks(attack_type,step=1):

    impute = df[['Date','attacktype1']][(df.attacktype1==attack_type)&(df.iyear<=1992)].groupby('Date').attacktype1.count().reset_index()
    impute.rename(columns={'attacktype1':'num_attacks'},inplace=True)
    #converting 'date' column to a datetime type
    impute['Date'] = pd.to_datetime(impute['Date'])
    # resetting date back to the index
    impute.set_index('Date',inplace = True)
    impute.num_attacks = impute.num_attacks.astype(float)
    model = ARMA(impute, order=(1,0))
    model_fit = model.fit()

    params = model_fit.params
    residuals = model_fit.resid
    p = model_fit.k_ar
    q = model_fit.k_ma
    k_exog = model_fit.k_exog
    k_trend = model_fit.k_trend
    steps = step
    
    attack_txt = df.attacktype1_txt[df.attacktype1==attack_type].unique()[0]
    return attack_txt, _arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=impute, exog=None, start=len(impute))[0]

Results:¶

# loop through each attack type and predict the number of attacks
predicted = []
for attack_type in (df.attacktype1.unique()):
    predicted.append(impute_attacks(attack_type))

# turn into dataframe and spit out total number of 1993 terror attacks
predicted = pd.DataFrame(predicted,columns=['Attack Type','1993 Predicted Number of Attacks'])
print 'Total # of Terror Attacks in 1993: ',predicted.sum()[1]; predicted

Total # of Terror Attacks in 1993:  4886.30141491

//anaconda/lib/python2.7/site-packages/statsmodels/tsa/kalmanf/kalmanfilter.py:649: RuntimeWarning: divide by zero encountered in divide
  R_mat, T_mat)

	0
eventid	0.000000
iyear	0.000000
imonth	0.000000
iday	0.000000
approxdate	96.966295
extended	0.000000
resolution	97.766183
country	0.000000
country_txt	0.000000
region	0.000000
region_txt	0.000000
provstate	9.261858
city	0.284490
latitude	2.882530
longitude	2.882530
specificity	0.000000
vicinity	0.000000
location	73.074911
summary	42.188656
crit1	0.000000
crit2	0.000000
crit3	0.000000
doubtterr	0.000638
alternative	84.540607
alternative_txt	0.000000
multiple	0.000000
success	0.000000
suicide	0.000000
attacktype1	0.000000
attacktype1_txt	0.000000
...	...
propextent	64.054806
propextent_txt	0.000000
propvalue	80.027046
propcomment	68.475238
ishostkid	0.113541
nhostkid	92.812492
nhostkidus	92.847575
nhours	97.893757
ndays	95.801546
divert	99.815656
kidhijcountry	97.901411
ransom	52.101141
ransomamt	99.237747
ransomamtus	99.737836
ransompaid	99.602608
ransompaidus	99.743577
ransomnote	99.731457
hostkidoutcome	94.460108
hostkidoutcome_txt	0.000000
nreleased	94.836450
addnotes	86.015360
scite1	42.309851
scite2	60.987294
scite3	78.228255
dbsource	0.000000
INT_LOG	0.000000
INT_IDEO	0.000000
INT_MISC	0.000000
INT_ANY	0.000000
related	86.973439

	0
eventid	int64
iyear	int64
imonth	int64
iday	int64
approxdate	object
extended	int64
resolution	object
country	int64
country_txt	object
region	int64
region_txt	object
provstate	object
city	object
latitude	float64
longitude	float64
specificity	int64
vicinity	int64
location	object
summary	object
crit1	int64
crit2	int64
crit3	int64
doubtterr	float64
alternative	float64
alternative_txt	object
multiple	int64
success	int64
suicide	int64
attacktype1	int64
attacktype1_txt	object
...	...
propextent	float64
propextent_txt	object
propvalue	float64
propcomment	object
ishostkid	float64
nhostkid	float64
nhostkidus	float64
nhours	float64
ndays	float64
divert	object
kidhijcountry	object
ransom	float64
ransomamt	float64
ransomamtus	float64
ransompaid	float64
ransompaidus	float64
ransomnote	object
hostkidoutcome	float64
hostkidoutcome_txt	object
nreleased	float64
addnotes	object
scite1	object
scite2	object
scite3	object
dbsource	object
INT_LOG	int64
INT_IDEO	int64
INT_MISC	int64
INT_ANY	int64
related	object

	eventid	iyear	imonth	iday	extended	country	region	latitude	longitude	specificity	...	ransomamt	ransomamtus	ransompaid	ransompaidus	hostkidoutcome	nreleased	INT_LOG	INT_IDEO	INT_MISC	INT_ANY
count	1.567720e+05	156772.000000	156772.000000	156772.000000	156772.000000	156772.000000	156772.000000	152253.000000	152253.000000	156772.000000	...	1.195000e+03	4.110000e+02	6.230000e+02	402.000000	8685.000000	8095.000000	156772.000000	156772.000000	156772.000000	156772.000000
mean	2.000541e+11	2000.474083	6.484666	15.455215	0.041347	133.087401	6.970097	23.190988	24.210467	1.452632	...	3.320127e+06	5.454451e+05	4.319721e+05	305.196517	4.592170	-27.788635	-4.834645	-4.789114	0.093894	-4.221124
std	1.298283e+09	12.982397	3.392225	8.815533	0.199091	113.946290	2.967803	19.220723	59.900831	1.016971	...	3.187694e+07	6.665967e+06	2.589893e+06	3409.027685	2.049184	58.524976	4.528862	4.589779	0.602442	4.686143
min	1.970000e+11	1970.000000	0.000000	0.000000	0.000000	4.000000	1.000000	-53.154613	-176.176447	1.000000	...	-9.900000e+01	-9.900000e+01	-9.900000e+01	-99.000000	1.000000	-99.000000	-9.000000	-9.000000	-9.000000	-9.000000
25%	1.989082e+11	1989.000000	4.000000	8.000000	0.000000	69.000000	5.000000	10.756961	-1.929857	1.000000	...	0.000000e+00	0.000000e+00	-9.900000e+01	0.000000	2.000000	-99.000000	-9.000000	-9.000000	0.000000	-9.000000
50%	2.005071e+11	2005.000000	6.000000	15.000000	0.000000	101.000000	6.000000	31.285506	41.919647	1.000000	...	1.250000e+04	0.000000e+00	0.000000e+00	0.000000	4.000000	0.000000	-9.000000	-9.000000	0.000000	0.000000
75%	2.013060e+11	2013.000000	9.000000	23.000000	0.000000	160.000000	10.000000	34.842222	68.416974	1.000000	...	4.115000e+05	0.000000e+00	4.276840e+03	0.000000	7.000000	1.000000	0.000000	0.000000	0.000000	0.000000
max	2.015123e+11	2015.000000	12.000000	31.000000	1.000000	1004.000000	12.000000	74.633553	179.366667	5.000000	...	1.000000e+09	1.320000e+08	4.100000e+07	48000.000000	7.000000	1201.000000	1.000000	1.000000	1.000000	1.000000

	Column	Num_unique
0	eventid	156772
1	iyear	45
2	imonth	13
3	iday	32
4	approxdate	1426
5	extended	2
6	resolution	2657
7	country	206
8	country_txt	206
9	region	12
10	region_txt	12
11	provstate	2509
12	city	31324
13	latitude	52021
14	longitude	51632
15	specificity	5
16	vicinity	3
17	location	35794
18	summary	34335
19	crit1	2
20	crit2	2
21	crit3	2
22	doubtterr	3
23	alternative	5
24	alternative_txt	6
25	multiple	2
26	success	2
27	suicide	2
28	attacktype1	9
29	attacktype1_txt	9
...	...	...
107	propextent	4
108	propextent_txt	5
109	propvalue	604
110	propcomment	17437
111	ishostkid	3
112	nhostkid	221
113	nhostkidus	28
114	nhours	34
115	ndays	289
116	divert	142
117	kidhijcountry	217
118	ransom	3
119	ransomamt	350
120	ransomamtus	21
121	ransompaid	122
122	ransompaidus	8
123	ransomnote	285
124	hostkidoutcome	7
125	hostkidoutcome_txt	8
126	nreleased	155
127	addnotes	9613
128	scite1	66698
129	scite2	50118
130	scite3	28489
131	dbsource	26
132	INT_LOG	3
133	INT_IDEO	3
134	INT_MISC	3
135	INT_ANY	3
136	related	18082

	Attack Type	1993 Predicted Number of Attacks
0	Assassination	1067.181690
1	Hostage Taking (Kidnapping)	133.048975
2	Bombing/Explosion	1706.553880
3	Facility/Infrastructure Attack	452.218146
4	Armed Assault	1301.690876
5	Hijacking	17.084018
6	Unknown	143.243050
7	Unarmed Assault	43.069847
8	Hostage Taking (Barricade Incident)	22.210933