diff --git a/README.md b/README.md
index 562bb00..6292982 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,9 @@
-# censorwatch
-Website for Censorwatch, CIS' project to map net neutrality violations and web censorship in India
+# CensorWatch
+
+This repository hosts the website for CensorWatch, CIS' project to map net neutrality violations and web censorship in India, the data collected through the project, and the scripts used to analyse the data.
+
+To download the data, visit the Releases section of this repository.
+
+To import the data (using MongoDB), extract the zip archive and, inside the extracted directory, run `mongorestore --db log_database`.
+
+To analyse the data (using R), see the `analysis_scripts` directory. `connect.R` and `query1.R` are useful starting points.
diff --git a/analysis_scripts/confirm_DNS_blocks.R b/analysis_scripts/confirm_DNS_blocks.R
new file mode 100644
index 0000000..ff30f7a
--- /dev/null
+++ b/analysis_scripts/confirm_DNS_blocks.R
@@ -0,0 +1,50 @@
+# Flags measurements in MongoDB so that later analysis can filter on them:
+#   * DNS probes that resolved to a known censorious resolver IP get
+#     confirmed_block = "True";
+#   * TLS probes from the ASNs in `bad_measurements` get invalid = "True".
+# Expects the `dnsprobe` and `tlsprobe` collection handles from connect.R.
+
+censorious_DNS_servers <- c(
+  "203.109.71.154", "123.176.40.68", "106.51.113.17", "123.176.40.69",
+  "49.207.46.38", "123.176.40.67", "49.207.46.62", "202.83.21.15",
+  "49.205.75.6", "202.83.24.75", "202.83.21.14", "218.248.112.60"
+)
+
+# One multi-document update per server replaces the earlier
+# find-then-update-by-_id loop; the same documents are modified.
+for (server in censorious_DNS_servers) {
+  dnsprobe$update(
+    query = paste0('{ "result.test_result.resolved_ip": "', server, '"}'),
+    update = '{ "$set" : { "confirmed_block" : "True"} }',
+    multiple = TRUE
+  )
+}
+
+# NOTE(review): this call used to run at top level. `mongoID` is never
+# defined and `mongo` is the constructor called in connect.R, not a
+# collection handle, so sourcing the script stopped here before
+# `bad_measurements` was created. It looks like a pasted usage template and
+# is kept for reference only.
+# mongo$update(
+#   query = paste0('{"_id": { "$oid" : "', mongoID, '" } }'),
+#   update = '{ "$set" : { "confirmed_block" : "True"} }'
+# )
+
+# ASNs whose measurements are excluded from the analysis; final_table.r reads
+# this vector too.
+bad_measurements <- c(
+  "AS133997", "AS132559", "AS132976", "AS133287", "AS134177", "AS134293",
+  "AS134674", "AS135690", "AS136305", "AS138277", "AS139567", "AS45235",
+  "AS58965", "AS133720"
+)
+
+# NOTE(review): only tlsprobe documents are flagged, although
+# final_table_regions.r filters on `invalid` in all three collections --
+# confirm whether dnsprobe and httpprobe need the same treatment.
+for (asn in bad_measurements) {
+  tlsprobe$update(
+    query = paste0('{ "ip_info.asn.asn": "', asn, '"}'),
+    update = '{ "$set" : { "invalid" : "True"} }',
+    multiple = TRUE
+  )
+}
diff --git a/analysis_scripts/connect.R b/analysis_scripts/connect.R
new file mode 100644
index 0000000..ce16185
--- /dev/null
+++ b/analysis_scripts/connect.R
@@ -0,0 +1,9 @@
+# Opens the MongoDB collection handles used by the analysis scripts.
+# mongo() comes from the mongolite package.
+library(mongolite)
+
+connection_string <- "mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false"
+dnsprobe <- mongo(collection = "dnsprobe", db = "log_database", url = connection_string)
+tlsprobe <- mongo(collection = "tlsprobe", db = "log_database", url = connection_string)
+# HTTP probe results are read from the "http2probe" collection.
+httpprobe <- mongo(collection = "http2probe", db = "log_database", url = connection_string)
diff --git a/analysis_scripts/final_table.r b/analysis_scripts/final_table.r
new file mode 100644
index 0000000..a71c7da
--- /dev/null
+++ b/analysis_scripts/final_table.r
@@ -0,0 +1,109 @@
+# Builds `results3`: one row per ASN with the number of hosts that were
+# blocked, inconclusive or unmeasured across the DNS, TLS and HTTP probes.
+# `net_blocklist` accumulates every host blocked on at least one ASN.
+#
+# Needs in the session: the collection handles from connect.R,
+# `bad_measurements` (confirm_DNS_blocks.R), and the `bad_websites` and
+# `high_error_websites` vectors, which this script does not define.
+stopifnot(
+  exists("bad_measurements"),
+  exists("bad_websites"),
+  exists("high_error_websites")
+)
+
+# Summarises one probe's per-(host, group) aggregate for one group value.
+#   agg          aggregate result; `_id` holds `host` and the `key` field
+#   key          grouping field inside `_id`, e.g. "asn_number"
+#   value        group value to select
+#   hosts        hosts to keep; rows for other hosts are ignored
+#   unknown_col  optional count column; a host whose readings all fall in it
+#                does not count as measured
+# Returns a list: `measured`, `errored` and `blocked` host vectors plus the
+# `total` number of measurements.
+summarise_probe <- function(agg, key, value, hosts, unknown_col = NULL) {
+  ids <- agg[["_id"]]
+  rows <- which((ids[[key]] == value) & (ids$host %in% hosts))
+  host <- ids$host[rows]
+  n <- agg$measurements[rows]
+  measured <- host
+  if (!is.null(unknown_col)) {
+    measured <- host[n != agg[[unknown_col]][rows]]
+  }
+  list(
+    measured = measured,
+    # errored: every measurement of the host returned an error
+    errored = host[n == agg$Num_error[rows]],
+    blocked = host[agg$Num_blocked[rows] >= 1],
+    total = sum(as.numeric(n))
+  )
+}
+
+httpHostsBlockedPerASN <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+# NOTE(review): DNS errors are keyed on `sourceIp` here but on `error` in
+# final_table_regions.r -- confirm which field is intended.
+dnsHostsBlockedPerASN <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$sourceIp", false]}, 1, 0 ] }}, "Num_unknown": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "Unknown" ] }, 1, 0 ] }} ,"Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$confirmed_block", "True" ] }, 1, 0 ] }} }}]')
+tlsHostsBlockedPerASN <- tlsprobe$aggregate('[{"$group":{"_id": {"host":"$sniHostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error1", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn")
+httpprobe_unique_AS <- httpprobe$distinct("ip_info.asn.asn")
+tlsprobe_unique_AS <- tlsprobe$distinct("ip_info.asn.asn")
+
+httpprobe_hosts <- httpprobe$distinct("hostname")
+
+httpprobe_AS_number_name_map <- httpprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]')
+
+unique_ASNs <- union(union(httpprobe_unique_AS, dnsprobe_unique_AS), tlsprobe_unique_AS)
+ASNs_to_consider <- unique_ASNs[!unique_ASNs %in% bad_measurements]
+
+Hosts_to_consider <- httpprobe_hosts[!httpprobe_hosts %in% bad_websites]
+Hosts_to_consider <- Hosts_to_consider[!Hosts_to_consider %in% high_error_websites]
+
+empty_list <- rep(0, length(ASNs_to_consider))
+results3 <- data.frame(
+  asn = ASNs_to_consider,
+  asn_name = empty_list,
+  measurements = empty_list,
+  number_of_blocked_sites = empty_list,
+  number_of_inconclusive = empty_list,
+  number_of_unmeasured_sites = empty_list
+)
+
+# ASNs seen only by the DNS or TLS probes are missing from the HTTP-derived
+# name map; match() gives them NA instead of a zero-length replacement,
+# which made the former per-row assignment fail.
+results3$asn_name <- as.character(httpprobe_AS_number_name_map$AS_name)[
+  match(ASNs_to_consider, httpprobe_AS_number_name_map$"_id")
+]
+
+net_blocklist <- c()
+
+# seq_along() keeps the loop from running when no ASN is left to consider.
+for (x in seq_along(ASNs_to_consider)) {
+  asn <- ASNs_to_consider[x]
+
+  # Only the DNS aggregate has Num_unknown: a host whose DNS readings are all
+  # "Unknown" is treated as unmeasured.
+  dns <- summarise_probe(dnsHostsBlockedPerASN, "asn_number", asn, Hosts_to_consider, unknown_col = "Num_unknown")
+  tls <- summarise_probe(tlsHostsBlockedPerASN, "asn_number", asn, Hosts_to_consider)
+  http <- summarise_probe(httpHostsBlockedPerASN, "asn_number", asn, Hosts_to_consider)
+
+  # Blocked: at least one probe recorded a block for the host.
+  blocklist <- unique(c(dns$blocked, tls$blocked, http$blocked))
+  results3$number_of_blocked_sites[x] <- length(blocklist)
+
+  # Inconclusive: all readings of some probe errored, and not blocked.
+  list_of_inconclusive <- setdiff(union(union(dns$errored, tls$errored), http$errored), blocklist)
+  results3$number_of_inconclusive[x] <- length(list_of_inconclusive)
+
+  # Unmeasured: missing from at least one probe, and not blocked.
+  dns_blanks <- setdiff(Hosts_to_consider, dns$measured)
+  tls_blanks <- setdiff(Hosts_to_consider, tls$measured)
+  http_blanks <- setdiff(Hosts_to_consider, http$measured)
+  list_of_unmeasured <- setdiff(union(union(dns_blanks, tls_blanks), http_blanks), blocklist)
+  results3$number_of_unmeasured_sites[x] <- length(list_of_unmeasured)
+
+  # Mean number of readings per host and probe type.
+  results3$measurements[x] <- (dns$total + tls$total + http$total) / (length(Hosts_to_consider) * 3)
+
+  net_blocklist <- union(net_blocklist, blocklist)
+}
diff --git a/analysis_scripts/final_table_regions.r b/analysis_scripts/final_table_regions.r
new file mode 100644
index 0000000..5529e84
--- /dev/null
+++ b/analysis_scripts/final_table_regions.r
@@ -0,0 +1,103 @@
+# Builds `results4`: one row per reported state with the number of hosts that
+# were blocked, inconclusive or unmeasured across the DNS, TLS and HTTP
+# probes, and writes the table to BlocksByRegion.csv. Documents flagged
+# invalid = "True" are left out of the aggregates.
+#
+# Needs in the session: the collection handles from connect.R and the
+# `bad_websites` and `high_error_websites` vectors, which this script does
+# not define.
+stopifnot(exists("bad_websites"), exists("high_error_websites"))
+
+# Summarises one probe's per-(host, group) aggregate for one group value.
+# (Same helper as in final_table.r, repeated so either script runs alone.)
+#   agg          aggregate result; `_id` holds `host` and the `key` field
+#   key          grouping field inside `_id`, e.g. "region"
+#   value        group value to select
+#   hosts        hosts to keep; rows for other hosts are ignored
+#   unknown_col  optional count column; a host whose readings all fall in it
+#                does not count as measured
+# Returns a list: `measured`, `errored` and `blocked` host vectors plus the
+# `total` number of measurements.
+summarise_probe <- function(agg, key, value, hosts, unknown_col = NULL) {
+  ids <- agg[["_id"]]
+  rows <- which((ids[[key]] == value) & (ids$host %in% hosts))
+  host <- ids$host[rows]
+  n <- agg$measurements[rows]
+  measured <- host
+  if (!is.null(unknown_col)) {
+    measured <- host[n != agg[[unknown_col]][rows]]
+  }
+  list(
+    measured = measured,
+    # errored: every measurement of the host returned an error
+    errored = host[n == agg$Num_error[rows]],
+    blocked = host[agg$Num_blocked[rows] >= 1],
+    total = sum(as.numeric(n))
+  )
+}
+
+httpHostsBlockedPerState <- httpprobe$aggregate('[{"$match": {"invalid": {"$ne": "True"}}} , {"$group":{"_id": {"host":"$hostname", "region": "$state"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+dnsHostsBlockedPerState <- dnsprobe$aggregate('[{"$match": {"invalid": {"$ne": "True"}}} , {"$group":{"_id": {"host":"$hostname", "region": "$state"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_unknown": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "Unknown" ] }, 1, 0 ] }} , "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$confirmed_block", "True" ] }, 1, 0 ] }} }}]')
+tlsHostsBlockedPerState <- tlsprobe$aggregate('[{"$match": {"invalid": {"$ne": "True"}}} , {"$group":{"_id": {"host":"$sniHostname", "region": "$state"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error1", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+dnsprobe_unique_state <- dnsprobe$distinct("state")
+httpprobe_unique_state <- httpprobe$distinct("state")
+tlsprobe_unique_state <- tlsprobe$distinct("state")
+httpprobe_hosts <- httpprobe$distinct("hostname")
+
+# States seen by any of the three probes (the TLS probe used to be left out).
+unique_states <- union(union(httpprobe_unique_state, dnsprobe_unique_state), tlsprobe_unique_state)
+
+Hosts_to_consider <- httpprobe_hosts[!httpprobe_hosts %in% bad_websites]
+Hosts_to_consider <- Hosts_to_consider[!Hosts_to_consider %in% high_error_websites]
+
+empty_list <- rep(0, length(unique_states))
+results4 <- data.frame(
+  state = unique_states,
+  measurements = empty_list,
+  number_of_blocked_sites = empty_list,
+  number_of_inconclusive = empty_list,
+  number_of_unmeasured_sites = empty_list
+)
+
+for (x in seq_along(unique_states)) {
+  state <- unique_states[x]
+
+  # Only the DNS aggregate has Num_unknown: a host whose DNS readings are all
+  # "Unknown" is treated as unmeasured.
+  dns <- summarise_probe(dnsHostsBlockedPerState, "region", state, Hosts_to_consider, unknown_col = "Num_unknown")
+  tls <- summarise_probe(tlsHostsBlockedPerState, "region", state, Hosts_to_consider)
+  http <- summarise_probe(httpHostsBlockedPerState, "region", state, Hosts_to_consider)
+
+  # Blocked: at least one probe recorded a block for the host.
+  blocklist <- unique(c(dns$blocked, tls$blocked, http$blocked))
+  results4$number_of_blocked_sites[x] <- length(blocklist)
+
+  # Inconclusive: all readings of some probe errored, and not blocked.
+  list_of_inconclusive <- setdiff(union(union(dns$errored, tls$errored), http$errored), blocklist)
+  results4$number_of_inconclusive[x] <- length(list_of_inconclusive)
+
+  # Unmeasured: missing from at least one probe, and not blocked.
+  dns_blanks <- setdiff(Hosts_to_consider, dns$measured)
+  tls_blanks <- setdiff(Hosts_to_consider, tls$measured)
+  http_blanks <- setdiff(Hosts_to_consider, http$measured)
+  list_of_unmeasured <- setdiff(union(union(dns_blanks, tls_blanks), http_blanks), blocklist)
+  results4$number_of_unmeasured_sites[x] <- length(list_of_unmeasured)
+
+  # Mean number of readings per host and probe type.
+  results4$measurements[x] <- (dns$total + tls$total + http$total) / (length(Hosts_to_consider) * 3)
+}
+
+results4 <- results4[order(results4$number_of_unmeasured_sites), ]
+# The table has five columns; with only four names supplied the last column
+# was left without a usable header in the exported CSV.
+colnames(results4) <- c(
+  "Region",
+  "Readings",
+  "Number of Confirmed\nBlocked Sites",
+  "Number of Sites With\nInconclusive Readings",
+  "Number of\nUnmeasured Sites"
+)
+results4$Readings <- round(results4$Readings, digits = 1)
+rownames(results4) <- NULL
+write.csv(results4, "BlocksByRegion.csv", row.names = FALSE)
diff --git a/analysis_scripts/query1.R b/analysis_scripts/query1.R
new file mode 100644
index 0000000..73bc888
--- /dev/null
+++ b/analysis_scripts/query1.R
@@ -0,0 +1,47 @@
+# Prints, for every ASN and region, how many distinct hosts each probe type
+# (DNS, SNI, HTTP) reported as censored. Expects the collection handles from
+# connect.R.
+
+# Reports the censored-host count of one probe for each (ASN, region) pair.
+#   probe        collection handle to query
+#   host_field   document field that holds the tested host name
+#   asns         ASN identifiers to iterate over
+#   regions      state names to iterate over
+#   as_name_map  data frame with `_id` (ASN) and `AS_name` columns
+#   method       label printed at the end of each line
+report_blocks_by_region <- function(probe, host_field, asns, regions, as_name_map, method) {
+  for (asn in asns) {
+    for (region in regions) {
+      # distinct sites blocked for this ASN in this region
+      query_with_parameters <- paste0('{ "result.test_result.censored": "True", "ip_info.asn.asn" :"', asn, '", "state":"', region, '"}')
+      results <- probe$distinct(host_field, query_with_parameters)
+
+      # if nothing is blocked, move on
+      if (length(results) == 0) {
+        next
+      }
+
+      # map AS number to AS name
+      AS_name <- as_name_map$AS_name[which(as_name_map$"_id" == asn)]
+
+      # print findings
+      cat("AS number: ", asn, " AS name: ", AS_name, " blocks ", length(results), " websites in region: ", region, paste0(" using ", method, "\n"))
+    }
+  }
+  invisible(NULL)
+}
+
+dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn")
+dnsprobe_unique_region <- dnsprobe$distinct("state")
+dnsprobe_AS_number_name_map <- dnsprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]')
+report_blocks_by_region(dnsprobe, "hostname", dnsprobe_unique_AS, dnsprobe_unique_region, dnsprobe_AS_number_name_map, "DNS")
+
+tlsprobe_unique_AS <- tlsprobe$distinct("ip_info.asn.asn")
+tlsprobe_unique_region <- tlsprobe$distinct("state")
+tlsprobe_AS_number_name_map <- tlsprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]')
+report_blocks_by_region(tlsprobe, "sniHostname", tlsprobe_unique_AS, tlsprobe_unique_region, tlsprobe_AS_number_name_map, "SNI")
+
+httpprobe_unique_AS <- httpprobe$distinct("ip_info.asn.asn")
+httpprobe_unique_region <- httpprobe$distinct("state")
+httpprobe_AS_number_name_map <- httpprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]')
+report_blocks_by_region(httpprobe, "hostname", httpprobe_unique_AS, httpprobe_unique_region, httpprobe_AS_number_name_map, "HTTP")
diff --git a/analysis_scripts/query2.R b/analysis_scripts/query2.R
new file mode 100644
index 0000000..dbc5d8d
--- /dev/null
+++ b/analysis_scripts/query2.R
@@ -0,0 +1,29 @@
+# Prints, for every ASN seen by the DNS probe, how many readings each probe
+# type flagged as censored and how many readings it holds in total.
+# Expects the collection handles from connect.R.
+
+dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn")
+dnsprobe_AS_number_name_map <- dnsprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]')
+
+# Prints the censored and total reading counts of one probe for one ASN.
+# count() asks the server for the number of matching documents instead of
+# downloading them just to call nrow(); as.integer() keeps cat() from
+# switching to scientific notation for large counts.
+report_asn_counts <- function(probe, asn, AS_name, method) {
+  blocked_query <- paste0('{ "result.test_result.censored": "True", "ip_info.asn.asn" :"', asn, '"}')
+  total_query <- paste0('{ "ip_info.asn.asn" :"', asn, '"}')
+  number_of_blocks <- as.integer(probe$count(blocked_query))
+  total_measurements <- as.integer(probe$count(total_query))
+  cat("AS number: ", asn, " AS name: ", AS_name, " blocks ", number_of_blocks, paste0(" using ", method, ". Total readings for this ASN: "), total_measurements, "\n")
+  invisible(NULL)
+}
+
+# For each ASN
+for (asn in dnsprobe_unique_AS) {
+  # map AS number to AS name
+  AS_name <- dnsprobe_AS_number_name_map$AS_name[which(dnsprobe_AS_number_name_map$"_id" == asn)]
+
+  report_asn_counts(dnsprobe, asn, AS_name, "DNS")
+  report_asn_counts(tlsprobe, asn, AS_name, "SNI")
+  report_asn_counts(httpprobe, asn, AS_name, "HTTP")
+}
diff --git a/analysis_scripts/query3.R b/analysis_scripts/query3.R
new file mode 100644
index 0000000..bda9916
--- /dev/null
+++ b/analysis_scripts/query3.R
@@ -0,0 +1,269 @@
+dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn")
+dnsprobe_unique_company_name <- dnsprobe$distinct("ip_info.company.name")
+
+dnsprobe_AS_name_company_name_map <- dnsprobe$aggregate('[{"$group":{"_id":"$ip_info.company.name", "asn_name": { "$first" : "$ip_info.asn.name" }, "region": { "$first" : "$ip_info.region" }}}]')
+
+# DNS readings and censored counts grouped per userRecord.
+thing <- dnsprobe$aggregate('[{"$group":{"_id":"$userRecord", "company_name": { "$first" : "$ip_info.company.name" }, "asn_number": {"$first" : "$ip_info.asn.asn"}, "asn_name": { "$first" : "$ip_info.asn.name" }, "region": { "$first" : "$state" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+# The same counts grouped per (host, ASN).
+thing2 <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "company_name": { "$first" : "$ip_info.company.name" }, "asn_number": {"$first" : "$ip_info.asn.asn"}, "asn_name": { "$first" : "$ip_info.asn.name" }, "region": { "$first" : "$ip_info.region" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+# The same counts grouped per host.
+thing3 <- dnsprobe$aggregate('[{"$group":{"_id": "$hostname", "company_name": { "$first" : "$ip_info.company.name" }, "asn_number": {"$first" : "$ip_info.asn.asn"}, "asn_name": { "$first" : "$ip_info.asn.name" }, "region": { "$first" : "$ip_info.region" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+tlsHostsBlockedPerASNPerRegion <- tlsprobe$aggregate('[{"$group":{"_id": {"host":"$sniHostname", "asn_number": "$ip_info.asn.asn", "region": "$state" }, "company_name": { "$first" : "$ip_info.company.name" }, "asn_name": { "$first" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+httpHostsBlocked <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname"}, "company_name": { "$addToSet" : "$ip_info.company.name" }, "asn_name": { "$addToSet" : "$ip_info.asn.name" }, "asn_number": { "$addToSet" : "$ip_info.asn.asn"}, "region": { "$addToSet" : "$state" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+httpHostsBlockedPerASN <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "company_name": { "$first" : "$ip_info.company.name" }, "asn_name": { "$first" : "$ip_info.asn.name" }, "region": { "$first" : "$state" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+httpHostsBlockedPerASNPerRegion <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn", "region": "$state" }, "company_name": { "$addToSet" : "$ip_info.company.name" }, "asn_name": { "$addToSet" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+httpHostResponseCode <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname"}, "responseCode": { "$addToSet" : "$responseCode" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+
+dnsHostsBlocked <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname"}, "company_name": { "$addToSet" : "$ip_info.company.name" }, "asn_name": { "$addToSet" : "$ip_info.asn.name" }, "asn_number": { "$addToSet" : "$ip_info.asn.asn"}, "region": { "$addToSet" : "$state" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+tlsHostsBlocked <- tlsprobe$aggregate('[{"$group":{"_id": {"host":"$sniHostname"}, "company_name": { "$addToSet" : "$ip_info.company.name" }, "asn_name": { "$addToSet" : "$ip_info.asn.name" }, "asn_number": { "$addToSet" : "$ip_info.asn.asn"}, "region": { "$addToSet" : "$state" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+dnsHostsBlockedPerASNPerRegion <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn", "region": "$state" }, "company_name": { "$addToSet" : "$ip_info.company.name" }, "asn_name": { "$addToSet" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+dnsBlocksPerASNPerRegion <- dnsprobe$aggregate('[{"$group":{"_id": {"asn_number": "$ip_info.asn.asn", "region": "$state" }, "company_name": { "$first" : "$ip_info.company.name" }, "asn_name": { "$first" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+# Per-ASN counts; the TLS table reads tlsprobe (it used to query dnsprobe).
+dnsBlocksPerASN <- dnsprobe$aggregate('[{"$group":{"_id": {"ASN Number": "$ip_info.asn.asn" }, "ASN Name": { "$first" : "$ip_info.asn.name" }, "Measurements": { "$sum": 1 }, "Number of Confirmed Blocks": { "$sum": { "$cond": [ { "$eq": [ "$confirmed_block", "True" ] }, 1, 0 ] }} }}]')
+tlsBlocksPerASN <- tlsprobe$aggregate('[{"$group":{"_id": {"ASN Number": "$ip_info.asn.asn" }, "ASN Name": { "$first" : "$ip_info.asn.name" }, "Measurements": { "$sum": 1 }, "Number of Confirmed Blocks": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]')
+
+
+dnsAnomaliesPerASN <- dnsprobe$aggregate('[{"$group":{"_id": {"ASN Number": "$ip_info.asn.asn" }, "ASN Name": { "$first" : "$ip_info.asn.name" }, "Measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$sourceIp", false]}, 1, 0 ] }} , "Number of Confirmed Blocks": { "$sum": { "$cond": [ { "$eq": [ "$confirmed_block", "True" ] }, 1, 0 ] }} }}]') + + +location_comparison <- httpprobe$aggregate('[{"$group":{"_id":"$userRecord", "ip_region": { "$first" : "$ip_info.region" }, "reported_region": { "$first" : "$state"}}}]') + +dnsResponseIPs <- dnsprobe$aggregate('[{"$group":{"_id": "$result.test_result.resolved_ip", "websites": { "$addToSet" : "$hostname" }, "company_name": { "$addToSet" : "$ip_info.company.name" }, "asn_number": { "$addToSet" : "$ip_info.asn.asn"}, "region": { "$addToSet" : "$state"}, "asn_name": { "$addToSet" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +ASNsPerUserDNS <- dnsprobe$aggregate('[{"$group":{"_id": {"user":"$userRecord", "asn_number": "$ip_info.asn.asn"}, "company_name": { "$first" : "$ip_info.company.name" }, "asn_number": {"$first" : "$ip_info.asn.asn"}, "asn_name": { "$first" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') +ASNsPerUserHTTP <- httpprobe$aggregate('[{"$group":{"_id": {"user":"$userRecord", "asn_number": "$ip_info.asn.asn"}, "company_name": { "$first" : "$ip_info.company.name" }, "asn_number": {"$first" : "$ip_info.asn.asn"}, "asn_name": { "$first" : "$ip_info.asn.name" }, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +dnsprobe_hosts_unknown <- dnsprobe$distinct("hostname", '{"result.test_result.censored": "Unknown"}') + + +temp <- 0 +for (i in 
1:length(location_comparison[,2])) { + if (is.na(location_comparison[i, 2]) ) { + next + } + if (location_comparison[i, 2] != location_comparison[i,3]) { + temp <- temp + 1 + } +} +cat(temp, "out of ", length(location_comparison[,2]), " mismatches") + + + + +#print block pages +httpprobe_unique_AS <- httpprobe$distinct("ip_info.asn.asn") + +httpprobe_AS_number_name_map <- httpprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]') +blockpage_signatures <- c() +my_ASNs <- c("AS24309", "AS18209", "AS55577") + +# For each AS number +for (asn in httpprobe_unique_AS) { + + # query distinct block pages for this asn + query_with_parameters <- paste('{ "result.test_result.censored": "True", "ip_info.asn.asn" :"', asn, '"}', sep = "") + results <- httpprobe$distinct("result.test_result.response", query_with_parameters) + + # if nothing is blocked, move on + if (length(results) == 0) { + next + } + + # map AS number to AS name + #AS_name <- httpprobe_AS_number_name_map$AS_name[which(httpprobe_AS_number_name_map$"_id" == asn)] + + # print findings + #cat("AS number: ", asn, " AS name: ", AS_name, "uses the following block pages blockpages: \n \n \n", file="output.txt", sep="\n", append=TRUE) + + for (result in results) { + + signature_exists <- FALSE + for (signature in blockpage_signatures) { + if (levenshteinSim(result, signature) >= 0.8) { + signature_exists <- TRUE + break + } + } + + if (signature_exists == FALSE) { + blockpage_signatures <- c(blockpage_signatures, result) + } + #cat("response: ", result, "\n \n \n", file="output.txt", sep="\n", append=TRUE) + } + } + + +#print DNS results for blocked DNS hosts +dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn") +dnsprobe_AS_number_name_map <- dnsprobe$aggregate('[{"$group":{"_id":"$ip_info.asn.asn", "AS_name": { "$first" : "$ip_info.asn.name" }}}]') + +# For each AS number +for (asn in dnsprobe_unique_AS) { + + # query distinct block pages for this asn + 
query_with_parameters <- paste('{ "result.test_result.censored": "True", "ip_info.asn.asn" :"', asn, '"}', sep = "") + results <- dnsprobe$distinct("result.test_result.resolved_ip", query_with_parameters) + + # if nothing is blocked, move on + if (length(results) == 0) { + next + } + + # map AS number to AS name + AS_name <- dnsprobe_AS_number_name_map$AS_name[which(dnsprobe_AS_number_name_map$"_id" == asn)] + + # print findings + cat("AS number: ", asn, " AS name: ", AS_name, "gave the following dns servers in its own AS: \n \n \n") + + temp <- 0 + for (result in results) { + cat("response: ", result, "\n \n \n") + temp <- temp + 1 + if (temp == 20) { + break + } + } +} + + +results <- httpprobe$find('{"result.test_result.censored": "True"}', '{"result.test_result.response": 1}') +suspected_blocks <- unlist(results$result) +blockpage_snippets <- c("requested URL has been blocked", "webadmin/deny/", "The URL you're trying to reach has been blocked", "airtel.in/dot", "The URL you requested has been blocked", "This website/URL has been blocked") +confirmed_blocks <- 0 + +for (suspected_block in suspected_blocks) { + match_found <- FALSE + for (snippet in blockpage_snippets) { + if (grepl(snippet, suspected_block, fixed=TRUE) == TRUE) { + confirmed_blocks <- confirmed_blocks + 1 + match_found <- TRUE + break + } + } + + if (match_found == FALSE) { + cat("No match: ", suspected_block) + } +} +cat("Number of suspected HTTP blocks: ", length(suspected_blocks), ". 
Number of confirmed blocks: ", confirmed_blocks) + + +dnsprobe_hosts <- dnsprobe$distinct("hostname") +dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn") +results <- data.frame(hosts = dnsprobe_hosts) +empty_list <- rep(NA, length(results$hosts)) + +for (x in 1:length(dnsprobe_hosts)) { + for (asn in dnsprobe_unique_AS) { + results[[asn]] = data.frame(dns_measurements = empty_list, dns_blocks = empty_list, tls_measurements = empty_list, tls_blocks = empty_list, http_measurements = empty_list, http_blocks = empty_list) + + # Find DNS blocks + query_with_parameters <- paste('[{"$match": {"hostname": "', dnsprobe_hosts[x], '", "ip_info.asn.asn" :"', asn, '"}}, {"$group":{"_id": {"asn_number": "$ip_info.asn.asn", "host":"$hostname"}, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]', sep = "") + dnsBlocksForASNAndHost <- dnsprobe$aggregate(query_with_parameters) + + # Find TLS blocks + query_with_parameters <- paste('[{"$match": {"sniHostname": "', dnsprobe_hosts[x], '", "ip_info.asn.asn" :"', asn, '"}}, {"$group":{"_id": {"asn_number": "$ip_info.asn.asn", "host":"$hostname"}, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]', sep = "") + tlsBlocksForASNAndHost <- tlsprobe$aggregate(query_with_parameters) + + # Find DNS blocks + query_with_parameters <- paste('[{"$match": {"hostname": "', dnsprobe_hosts[x], '", "ip_info.asn.asn" :"', asn, '"}}, {"$group":{"_id": {"asn_number": "$ip_info.asn.asn", "host":"$hostname"}, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]', sep = "") + httpBlocksForASNAndHost <- httpprobe$aggregate(query_with_parameters) + + #Save results + if (length(dnsBlocksForASNAndHost) != 0) { + results[[asn]]$dns_measurements[x] = dnsBlocksForASNAndHost$measurements + 
results[[asn]]$dns_blocks[x] = dnsBlocksForASNAndHost$Num_blocked + } + + if (length(tlsBlocksForASNAndHost) != 0) { + results[[asn]]$tls_measurements[x] = tlsBlocksForASNAndHost$measurements + results[[asn]]$tls_blocks[x] = tlsBlocksForASNAndHost$Num_blocked + } + + if (length(httpBlocksForASNAndHost) != 0) { + results[[asn]]$http_measurements[x] = httpBlocksForASNAndHost$measurements + results[[asn]]$http_blocks[x] = httpBlocksForASNAndHost$Num_blocked + } + View(results) + } +} + +httpHostsBlockedPerASN <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') +dnsHostsBlockedPerASN <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') +tlsHostsBlockedPerASN <- tlsprobe$aggregate('[{"$group":{"_id": {"host":"$sniHostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +dnsprobe_hosts <- dnsprobe$distinct("hostname") +dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn") +results2 <- data.frame(hosts = dnsprobe_hosts) +empty_list <- rep(NA, length(results$hosts)) + +for (x in 9:10) { + hostname <- dnsprobe_hosts[x] + for (asn in dnsprobe_unique_AS) { + if (is.null(results2[[asn]])) { + results2[[asn]] = data.frame(dns_measurements = empty_list, dns_blocks = empty_list, tls_measurements = empty_list, tls_blocks = empty_list, http_measurements = empty_list, http_blocks = empty_list) + } + index <- which(dnsHostsBlockedPerASN$"_id"$host == hostname & dnsHostsBlockedPerASN$"_id"$asn_number == asn) + if(!identical(index, integer(0))){ + results2[[asn]]$dns_measurements[x] = 
dnsHostsBlockedPerASN$measurements[index] + results2[[asn]]$dns_blocks[x] = dnsHostsBlockedPerASN$Num_blocked[index] + } + + index <- which(tlsHostsBlockedPerASN$"_id"$host == hostname & tlsHostsBlockedPerASN$"_id"$asn_number == asn) + if(!identical(index, integer(0))){ + results2[[asn]]$tls_measurements[x] = tlsHostsBlockedPerASN$measurements[index] + results2[[asn]]$tls_blocks[x] = tlsHostsBlockedPerASN$Num_blocked[index] + } + + index <- which(httpHostsBlockedPerASN$"_id"$host == hostname & httpHostsBlockedPerASN$"_id"$asn_number == asn) + if(!identical(index, integer(0))){ + results2[[asn]]$http_measurements[x] = httpHostsBlockedPerASN$measurements[index] + results2[[asn]]$http_blocks[x] = httpHostsBlockedPerASN$Num_blocked[index] + } + + } +} + +for (asn in ASNs_to_consider) { + httpcount <- 0 + for (measurement in results2[[asn]]$http_measurements) { + if(is.na(measurement)) { + httpcount <- httpcount + 1 + } + } + + dnscount <- 0 + for (measurement in results2[[asn]]$dns_measurements) { + if(is.na(measurement)) { + dnscount <- dnscount + 1 + } + } + + tlscount <- 0 + for (measurement in results2[[asn]]$tls_measurements) { + if(is.na(measurement)) { + tlscount <- tlscount + 1 + } + } + + cat("ASN: ", asn, " has: ", httpcount, " HTTP blanks, ", dnscount, " DNS blanks, and ", tlscount, " TLS blanks \n") +} diff --git a/analysis_scripts/query4.R b/analysis_scripts/query4.R new file mode 100644 index 0000000..cd4a3e9 --- /dev/null +++ b/analysis_scripts/query4.R @@ -0,0 +1,118 @@ +httpHostsBlockedPerASN <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') +dnsHostsBlockedPerASN <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 
}, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$confirmed_block", "True" ] }, 1, 0 ] }} }}]') +tlsHostsBlockedPerASN <- tlsprobe$aggregate('[{"$group":{"_id": {"host":"$sniHostname", "asn_number": "$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +dnsprobe_unique_AS <- dnsprobe$distinct("ip_info.asn.asn") +httpprobe_unique_AS <- httpprobe$distinct("ip_info.asn.asn") +httpprobe_hosts <- httpprobe$distinct("hostname") + +unique_ASNs <- union(httpprobe_unique_AS, dnsprobe_unique_AS) +ASNs_to_consider <- unique_ASNs[!unique_ASNs %in% bad_measurements] +Hosts_to_consider <- httpprobe_hosts[!httpprobe_hosts %in% bad_websites] +results2 <- data.frame(hosts = Hosts_to_consider) +empty_list <- rep(NA, length(results2$hosts)) +for (asn in ASNs_to_consider) { + if (is.null(results2[[asn]])) { + results2[[asn]] = data.frame(dns_measurements = empty_list, dns_blocks = empty_list, tls_measurements = empty_list, tls_blocks = empty_list, http_measurements = empty_list, http_blocks = empty_list) + } +} + + +for(x in 1:length(Hosts_to_consider)) { + hostname <- Hosts_to_consider[x] + + indexes <- which(dnsHostsBlockedPerASN$"_id"$host == hostname) + for(index in indexes) { + asn <- dnsHostsBlockedPerASN$"_id"$asn_number[index] + if (is.na(asn) || is.null(results2[[asn]])) { + next + } + + results2[[asn]]$dns_measurements[x] = dnsHostsBlockedPerASN$measurements[index] + results2[[asn]]$dns_blocks[x] = dnsHostsBlockedPerASN$Num_blocked[index] + } + + indexes <- which(tlsHostsBlockedPerASN$"_id"$host == hostname) + for(index in indexes) { + asn <- tlsHostsBlockedPerASN$"_id"$asn_number[index] + if (is.na(asn) || is.null(results2[[asn]])) { + next + } + + results2[[asn]]$tls_measurements[x] = 
tlsHostsBlockedPerASN$measurements[index] + results2[[asn]]$tls_blocks[x] = tlsHostsBlockedPerASN$Num_blocked[index] + } + + indexes <- which(httpHostsBlockedPerASN$"_id"$host == hostname) + for(index in indexes) { + asn <- httpHostsBlockedPerASN$"_id"$asn_number[index] + if (is.na(asn) || is.null(results2[[asn]])) { + next + } + + results2[[asn]]$http_measurements[x] = httpHostsBlockedPerASN$measurements[index] + results2[[asn]]$http_blocks[x] = httpHostsBlockedPerASN$Num_blocked[index] + } +} + + +errorCounts <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname"}, "measurements": { "$sum": 1 },"Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_timeouts": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*Timeout*."} }, 1, 0 ] }}, "Num_NotFounds": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*NotFound*."} }, 1, 0 ] }}, "Num_socket_err": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*SocketException*."} }, 1, 0 ] }}, "Num_connect_err": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*ConnectException*."} }, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + + +errorCountsByUser <- httpprobe$aggregate('[{"$group":{"_id": {"user":"$userRecord"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }},"Num_timeouts": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*Timeout*."} }, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +errorCountsByASN <- httpprobe$aggregate('[{"$group":{"_id": {"user":"$ip_info.asn.asn"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }},"Num_timeouts": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*Timeout*."} }, 
1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +errorCountsByIP <- httpprobe$aggregate('[{"$group":{"_id": {"source":"$sourceIp"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }},"Num_timeouts": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*Timeout*."} }, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +errorCountsByHost <- httpprobe$aggregate('[{"$group":{"_id": {"host":"$hostname"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }},"Num_timeouts": { "$sum": { "$cond": [ {"$regexMatch": {"input": "$error", "regex": ".*Timeout*."} }, 1, 0 ] }}, "Num_blocked": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "True" ] }, 1, 0 ] }} }}]') + +unknownCountsByHost <- dnsprobe$aggregate('[{"$group":{"_id": {"host":"$hostname"}, "measurements": { "$sum": 1 }, "Num_error": { "$sum": { "$cond": [ {"$ifNull": ["$error", false]}, 1, 0 ] }}, "Num_unknown": { "$sum": { "$cond": [ { "$eq": [ "$result.test_result.censored", "Unknown" ] }, 1, 0 ] }} }}]') + +unknownCountsByHost$present <- FALSE +for (x in 1:length(unknownCountsByHost$"_id"$host)) { + host <- unknownCountsByHost$"_id"$host[x] + if (host %in% Hosts_to_consider) { + unknownCountsByHost$present[x] <- TRUE + } +} + +errorCountsByHost$errorRate <- 0 +for (x in 1:length(errorCountsByHost$"_id"$host)) { + host <- errorCountsByHost$"_id"$host[x] + errorCountsByHost$errorRate[x] <- errorCountsByHost$Num_error[x]/errorCountsByHost$measurements[x] +} + +high_error_websites <- c() +for (x in 1:length(errorCountsByHost$"_id"$host)) { + host <- errorCountsByHost$"_id"$host[x] + if (errorCountsByHost$errorRate[x] >= 0.85) { + high_error_websites <- append(high_error_websites, host) + } +} + +ASNs_to_consider <- 
unique_ASNs[!unique_ASNs %in% bad_measurements] +blanksByHost2 <- data.frame(hosts = results2$hosts) +blanksByHost2$Num_blanks <- 0 +blanksByHost2$ASNs <- " " +for(x in 1:length(results2$hosts)) { + hostname <- results2$hosts[x] + for (asn in ASNs_to_consider) { + if (is.na(results2[[asn]]$http_measurements[x])) { + blanksByHost2$Num_blanks[x] <- blanksByHost2$Num_blanks[x] + 1 + blanksByHost2$ASNs[x] <- paste(blanksByHost2$ASNs[x], asn) + } + } +} + +bad_websites <- c() +for(x in 1:length(blanksByHost$hosts)) { + hostname <- blanksByHost$hosts[x] + if (blanksByHost$Num_blanks[x] >= 10) { + bad_websites <- append(bad_websites, hostname) + } +} + +pdf("mypdf.pdf", height=18, width=17) +grid.table(results3) +dev.off() \ No newline at end of file