forked from ulfelder/dart-throwing-chimp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathacled.importer.R
80 lines (69 loc) · 5.56 KB
/
acled.importer.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# This script will download, unzip, prep, and merge ACLED's Version 5 historical data (1997-2014) and its realtime
# data. The downside is that it will need to be modified as the realtime data are updated, or if the url or filename for
# the historical data changes. The upside is that those changes only need to be made in the "File info" section that
# starts on line 21. Unless the basic structure of the files or their variable names change, the rest should keep working.
# See https://dartthrowingchimp.wordpress.com/2015/07/19/acled-in-r/ for a blog post discussing the script.
# Load required packages
library(dplyr)
library(tidyr)
library(countrycode)
library(ggplot2)
# Info on files to be ingested. **THIS IS THE PART THAT NEEDS TO BE UPDATED BY HAND AS ACLED POSTS UPDATES.** To get
# the strings that follow, I did the following on a Windows-driven PC:
# 1. Pointed my browser to http://www.acleddata.com/data/
# 2. Clicked on http://www.acleddata.com/data/version-5-data-1997-2014/ to get info on historical data
# 3. Right-clicked on the (csv) hyperlink for 'ACLED Version 5 (1997 – 2014) standard file' and selected 'Copy link address'
# 4. Used Ctrl-V to paste that in between quotation marks in the past.url slot below.
# 5. Left-clicked on that same link to download the .zip file
# 6. Double-clicked on the downloaded .zip file to inspect the contents
# 7. Right-clicked on the .csv in the resulting window, selected 'Properties', and used Ctrl-C to copy the csv file's name
# 8. Used Ctrl-V to paste that file name in between quotation marks in the past.file slot below
# 9. Back on the ACLED site, clicked on 'Realtime Data (2015)'
# 10. Repeated steps 3 through 8 for 'Realtime 2015 All Africa File (updated 11th July 2015)(csv)' and the realtime.* slots below
# Historical data (Version 5, 1997-2014): address of the zip archive, then the name of the csv file inside it
past.url <- "http://www.acleddata.com/wp-content/uploads/2015/06/ACLED-Version-5-All-Africa-1997-2014_dyadic_Updated_csv-no-notes.zip"
past.file <- "ACLED-Version-5-All-Africa-1997-2014_dyadic_Updated_no_notes.csv"
# Realtime data (2015): address of the zip archive, then the name of the csv file inside it
realtime.url <- "http://www.acleddata.com/wp-content/uploads/2015/07/ACLED-All-Africa-File_20150101-to-20150711_csv.zip"
realtime.file <- "ACLED All Africa File_20150101 to 20150711_csv.csv"
# Download a zip archive and read one csv file out of it.
#
# Args:
#   vector: character vector whose first element is the url of the zip archive and whose second
#           element is the name of the csv file stored inside that archive.
#
# Returns:
#   A data frame holding the contents of the csv file; strings are kept as character, not factors.
#
# Stops with an error if 'vector' is not a character vector with at least two elements, or if the
# download or the read fails.
getfile <- function(vector) {
  stopifnot(is.character(vector), length(vector) >= 2)
  temp <- tempfile(fileext = ".zip")
  # Register the cleanup before doing any work so the temporary archive is removed even when the
  # download or the read stops with an error
  on.exit(unlink(temp), add = TRUE)
  # A zip archive is binary, so ask for a binary transfer explicitly instead of relying on the
  # platform default
  download.file(vector[1], temp, mode = "wb")
  read.csv(unz(temp, vector[2]), stringsAsFactors = FALSE)
}
# Data fetching
# Pair each download address with the name of the csv file inside its zip archive (historical
# data first, realtime data second)
ACLED.targets <- list(c(past.url, past.file), c(realtime.url, realtime.file))
# Download and read every target; the result is a list of data frames in the same order
ACLED.list <- lapply(ACLED.targets, getfile)
# Harmonize variable names across the two files before merging. The pattern is anchored so that a
# column already called GEO_PRECISION is left alone instead of being turned into GEO_PRECISIONION.
names(ACLED.list[[1]]) <- sub("^GEO_PRECIS$", "GEO_PRECISION", names(ACLED.list[[1]]))
# Rename the realtime file's location variables to match the names used in Version 5
names(ACLED.list[[2]]) <- gsub("ADM_LEVEL_", "ADMIN", names(ACLED.list[[2]]))
# Merge all files in the list on their shared columns, keeping all non-duplicate rows
ACLED <- Reduce(function(...) merge(..., all = TRUE), ACLED.list)
# Convert variable names in the merged file to lower case
names(ACLED) <- tolower(names(ACLED))
# Inspect the result to make sure it's worked as expected
str(ACLED)
# Get country-month counts of each event type and add column counting all battles of any type
ACLED.cm <- ACLED %>%
  # Lower-case and sanitize the event type labels so they can serve as column names; this also
  # folds "Remote Violence" and "Remote violence" into a single type
  mutate(event_type = make.names(tolower(event_type))) %>%
  # Month var to use in grouping; taken from characters 4-5 of event_date
  # (assumes the dates are written dd/mm/yyyy -- TODO confirm if the source format changes)
  mutate(month = as.numeric(substr(event_date, 4, 5))) %>%
  # Groupings from highest to lowest level
  group_by(gwno, year, month, event_type) %>%
  # Count the records in each country/year/month/type subset
  tally() %>%
  # tally() drops only the last grouping level. Remove the rest so that the steps below work on the
  # whole table: otherwise expand() only sees one group at a time, and select() drags the grouping
  # columns into the row sums.
  ungroup() %>%
  # Make the data wide by spreading event types into columns
  spread(key = event_type, value = n, fill = 0) %>%
  # Cover all possible country-months by left-joining the tallies to the complete series
  left_join(expand(., gwno, year, month), ., by = c("gwno", "year", "month")) %>%
  # Country-months with no recorded events come out of the join as NA; turn them into 0s
  replace(is.na(.), 0) %>%
  # Sum the counts of all battle types
  mutate(battles = rowSums(select(., contains("battle")))) %>%
  # Keep only months that ended before the current calendar month
  filter(
    year < as.numeric(format(Sys.Date(), "%Y")) |
      (year == as.numeric(format(Sys.Date(), "%Y")) & month < as.numeric(format(Sys.Date(), "%m")))
  ) %>%
  # Add country names from the numeric codes.
  # NOTE(review): gwno is read here with countrycode's "cown" scheme -- confirm the two coding
  # schemes agree for every country in the data.
  mutate(country = countrycode(gwno, "cown", "country.name", warn = FALSE))
# Some examples of time-series plots
# Monthly counts of events involving violence against civilians in Burundi
ACLED.cm %>%
  filter(country == "Burundi") %>%
  # First day of each month as a Date, to get a continuous time axis
  mutate(yearmo = as.Date(sprintf("%d-%02d-01", year, month))) %>%
  # Build the complete plot inside braces before handing it to print. %>% binds more tightly than +,
  # so without the braces only the last layer would be piped to print and the plot itself would
  # show up only through top-level auto-printing (and not at all under source()).
  {
    ggplot(., aes(x = yearmo, y = violence.against.civilians)) +
      geom_line() +
      labs(x = "Month", y = "Event count", title = "Violence against civilians in Burundi")
  } %>%
  print
# Monthly counts of battles by country (small multiples)
ACLED.cm %>%
  # First day of each month as a Date, to get a continuous time axis
  mutate(yearmo = as.Date(sprintf("%d-%02d-01", year, month))) %>%
  # One panel per country. The plot is assembled inside braces so that the whole plot, not just
  # its last layer, is what gets piped to print.
  {
    ggplot(., aes(x = yearmo, y = battles)) +
      geom_line() +
      facet_wrap(~country) +
      labs(x = "Month", y = "Event count: battles")
  } %>%
  print
# If you'd rather save that plot to your hard drive, replace the final 'print' line with one like this
# (the 'plot = .' part is needed so the piped plot is not mistaken for the file name):
# ggsave("ACLED.battles.ts.png", plot = ., path = "[path to directory in which to save it]", width = 6, height = 8, units="in")