diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..2c9581b --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,15 @@ +Package: osc +Type: Package +Title: Orthodromic Spatial Clustering +Version: 1.0.0 +Date: 2016-05-10 +Depends: raster, R (>= 2.14) +Suggests: testthat +Author: Steffen Kriewald, Till Fluschnik, Dominik Reusser, Diego Rybski +Maintainer: Steffen Kriewald +Description: Allows distance based spatial clustering of georeferenced data by implementing the City Clustering Algorithm - CCA. Multiple versions allow clustering for matrix, raster and single coordinates on a plain (euclidean distance) or on a sphere (great-circle or orthodromic distance). +License: GPL +NeedsCompilation: yes +Packaged: 2016-05-10 13:05:35 UTC; kriewald +Repository: CRAN +Date/Publication: 2016-05-10 19:05:05 diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..1da00fd --- /dev/null +++ b/MD5 @@ -0,0 +1,34 @@ +f48a7147e02c0724e05b6358e127ab85 *DESCRIPTION +33268da2b5a0174d693206835bc49e9e *NAMESPACE +3b7548d94355d3aac0d7a20f96f8bf3f *R/cca.R +456b71ba9225142206fdc6d41077d4f2 *R/cca.single.R +911fcd63d0bc1d8c0ae950319404655d *R/code_revised_cca.R +ebd670ecc096df9ce8e3f53dcc0419cb *R/getPart.R +94bdc1f82757c353cb1003e56241cdd6 *build/vignette.rds +6d7bc5a2d4e9a0646afa0c7e770a26bb *data/datalist +58fd5c862c5b503e50d5245be0be11d4 *data/exampledata.rda +e4610df58b13625c9b23bd995a55faff *data/landcover.rda +4c396c53f51248f850075e93a4d3ddfc *data/population.rda +971ec966bdc8fc42d3f2eab9a45fc424 *inst/doc/paper.R +b0f652cbee75143ec695dc018de995b2 *inst/doc/paper.pdf +bda20f004bf359c3fffe7c064e7dfc40 *inst/doc/paper.rnw +71ed23ec67d3a47015db2b0b8dc06b07 *man/assign.data.Rd +f784386850049b09d50f411c8cca098f *man/cca.Rd +a48d5876310a824b38f8bedd1fe49b96 *man/coordinate.list.Rd +caf4c194ad1c8880a0991f019062f05f *man/exampledata.Rd +83dddfbd273fc93fc26ca39690fa42b7 *man/landcover.Rd +0d846cb42d6a64028ee1bf6b573ce27c *man/osc.buffer.Rd +3fefbda3fcd0894582ba971665e7c3fc *man/population.Rd +436aff5a71c41276d06bbd6a31f2b4e6 *src/cca-core.c +6e3121545e3e3dc8caabba7ba9200906 *src/ccaRevolution.c +251bdb10958e0b5ac1737af4a05d85e2 *tests/testthat.R +35f54901471879ab9191912c70ffc422 *tests/testthat/test_cca_multi.R +1173e8ffffc2fc75fee4a0260c93a919 *tests/testthat/test_cca_single.R +d8266d70da0e7737622bb663ea664944 *tests/testthat/test_coordinate_list.R +716806c191b146997b4f6aaa4bd3be08 *vignettes/compare.pdf +bda20f004bf359c3fffe7c064e7dfc40 *vignettes/paper.rnw +49979365c9b21512171323f8aebf9c87 *vignettes/pics/exdat1.pdf +1fdeb0ba794228c0c0a232b901148d0c *vignettes/pics/exdat2.pdf +8183cf69156264257790404ae1a68a15 *vignettes/pics/fig-concordance.tex +63dcbf8e81d4a3f795ff4ce9bc7ed9be *vignettes/pics/landcover.pdf +b273fda9b9218cf2c707d4a4a5bd8b7a *vignettes/pics/raster.pdf diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..6a3b2f8 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,5 @@ +useDynLib(osc) + +export(cca, cca.single, assign.data, osc.buffer, coordinate.list) +import(raster) +importFrom("utils", "setTxtProgressBar", "txtProgressBar") diff --git a/R/cca.R b/R/cca.R new file mode 100644 index 0000000..3e10895 --- /dev/null +++ b/R/cca.R @@ -0,0 +1,32 @@ +cca <- function(data, s=1, mode=3, count.cells=FALSE, count.max=ncol(data)*3, res.x=NULL, res.y=NULL, cell.class=1, unit="", compare=""){ + if(class(data) %in% c("data.frame", "RasterLayer")){ + if(unit=="m"){ + ccaRm(data=data, d=s,res.x=res.x, res.y=res.y, cell.class=cell.class, compare) + } else { + ccaRd(data=data, d=s, cell.class=cell.class, compare) + } + } else { + ccaM(data=data, s=s,mode=mode, count.cells=count.cells, count.max=count.max) + } + +} +ccaM <- function(data, s=1, mode=3, count.cells=FALSE, count.max=ncol(data)*3){ + #do checks + stopifnot(is.numeric(data)) + stopifnot(is.matrix(data)) + stopifnot(is.numeric(s)) + stopifnot(mode==1 | mode==2 | mode==3) + the.data <- as.integer(t(data)) + clu <- as.integer(rep(0, ncol(data)*nrow(data))) + count.max <- as.integer(count.max) + count <- as.integer(rep(0, count.max)) + if(count.cells==TRUE){ + out <- .C("callburn_count", s=as.integer(s), xmax=nrow(data), ymax=ncol(data), mode=as.integer(mode)[1], data=the.data, clu=clu, count=count, count.max=count.max,CLASSES=c("integer", "integer", "integer", "integer", "integer","integer")) + return(list(clusters=matrix(out$clu, ncol=ncol(data), byrow=TRUE), cluster.count=out$count[1:max(out$clu)])) + } + if(count.cells==FALSE){ + out <- .C("callburn", s=as.integer(s), xmax=nrow(data), ymax=ncol(data), mode=as.integer(mode)[1], data=the.data, clu=clu, count=count, count.max=count.max,CLASSES=c("integer", "integer", "integer", "integer", "integer","integer")) + return(clusters=matrix(out$clu, ncol=ncol(data), byrow=TRUE)) + } +} + diff --git a/R/cca.single.R b/R/cca.single.R new file mode 100644 index 0000000..b6266c0 --- /dev/null +++ b/R/cca.single.R @@ -0,0 +1,29 @@ +cca.single <- function(data, s, x,y, mode=3){ + #do checks + stopifnot(is.numeric(data)) + stopifnot(is.matrix(data)) + stopifnot(is.numeric(s)) + stopifnot(is.numeric(x)) + stopifnot(is.numeric(y)) + x <- as.integer(x) - 1 # as C starts counting at 0 + y <- as.integer(y) - 1 # as C starts counting at 0 + xmax <- nrow(data) + ymax <- ncol(data) + stopifnot(x>=0 & x=0 & y cell.class))}} + else{ if(compare=="s"){compare <- function(v){return(which(v < cell.class))}} + else{compare <- function(v){return(which(v %in% cell.class))}} + } + + + all <- NULL + coordinates <- NULL + + bs <- blockSize(raster) + + print("get coordinates:") + # create progress bar + pb <- txtProgressBar(min = 0, max = bs$n, style = 3) + + for (i in 1:bs$n) { + v <- getValues(raster, row=bs$row[i], nrows=bs$nrows[i] ) + v <- compare(v) + + if(length(v)>0){ + coordinates <- xyFromCell(object=raster, cell= (raster@ncols * (bs$row[i]-1)) +v) + } + all <- rbind(all, coordinates) + coordinates <- NULL + setTxtProgressBar(pb, i) + } + + close(pb) + + return(all) +} + +######################################################### +# cluster urban coordinates +######################################################### + + +ccaR.order <- function(m) { m <- m[order(-m[,2]),]; m <- m[order(m[,1]),]; return(m); } + +ccaRm <- function(data="", d=500, res.x=NULL, res.y=NULL, cell.class, compare) +{ + + # If raster get coordinate list + if(class(data)=="RasterLayer"){ + res.x <- xres(data) + res.y <- yres(data) + data <- coordinate.list(data, cell.class, compare) + } + + # preprocessing + n1 <- length(data[,2]); + data <- ccaR.order(data); + data <- data[order(data[,2]),]; + print("Sorting... Done"); + data[,1] <- data[,1]*pi/180; + data[,2] <- data[,2]*pi/180; + + # calculate exit condition/distance + max_abs <- max(abs(data[,2])); + min_dist <- acos(sin(max_abs)*sin(max_abs)+cos(max_abs)*cos(max_abs)*cos(res.y*pi/180))*6371000; + min_dist_h <- acos(sin(0)*sin(res.x*pi/180)+cos(0)*cos(res.x*pi/180)*cos(0))*6371000; + + # create place holder for cluster id + data <- cbind(data, rep(0,length(data[,1]))) + data <- as.matrix(data) + data <- as.numeric(data[,1:3]); + + step_w <- floor(1+(d/min_dist)); + step_h <- ceiling(d/min_dist_h); + print("Start Clustering..."); + out <- .C("ccaRevT", m=as.numeric(data), n=as.integer(n1), l=as.numeric(d), + step_w=as.integer(step_w), step_h=as.integer(step_h), + res_x=as.numeric(res.x*pi/180), res_y=as.numeric(res.y*pi/180), + w=as.integer(array(0,n1))); + rm(data) + print("Clustering... Done"); + mat1 <- matrix(out$m, ncol=3, byrow=FALSE); + m1 <- array(0, max(mat1[,3])); + m1 <- .C("ccaSumT", m=as.numeric(mat1), m3=as.integer(mat1[,3]), mm=as.numeric(m1), n=as.integer(n1)); + print("Summary... Done"); + + mat1[,1] <- mat1[,1]*180/pi; + mat1[,2] <- mat1[,2]*180/pi; + mat1 <- as.data.frame(mat1) + colnames(mat1) <- c("long","lat","cluster_id") + return(list(cluster=mat1,size=m1$mm)); +} + + +ccaRd <- function(data="", d=500, cell.class, compare) +{ + + # If raster get coordinate list + if(class(data)=="RasterLayer"){ + data <- coordinate.list(data, cell.class, compare) + } + + # preprocessing + n1 <- length(data[,2]); + #data <- ccaR.order(data); + data <- data[order(data[,1]),]; + print("Sorting... Done"); + + # create place holder for cluster id + data <- cbind(data, rep(0,length(data[,1]))) + data <- as.matrix(data) + data <- as.numeric(data[,1:3]); + + print("Start Clustering..."); + out <- .C("ccaRev", m=as.numeric(data), n=as.integer(n1), l=as.numeric(d),w=as.integer(array(0,n1))); + rm(data) + print("Clustering... Done"); + mat1 <- matrix(out$m, ncol=3, byrow=FALSE); + m1 <- array(0, max(mat1[,3])); + m1 <- .C("ccaSum", m=as.numeric(mat1), m3=as.integer(mat1[,3]), mm=as.numeric(m1), n=as.integer(n1)); + print("Summary... Done"); + + mat1 <- as.data.frame(mat1) + colnames(mat1) <- c("long","lat","cluster_id") + return(list(cluster=mat1,size=m1$mm)); +} + +######################################################### +# population assignment to clusters +######################################################### + +assign.data <- function(cluster, points, dist=1000){ + + # conversion from meter to degree + wgs84 <- 6371000 + scale <- 360 / (2*pi*wgs84) + y_dist <- dist * scale + + points <- cbind(points, cluster_id=rep(0, length(points[, 1]))) + + pop_cluster <- NULL + + for(j in 1:length(points[, 1])){ + #print(j) + x <- points$long[j] + y <- points$lat[j] + x_dist <- dist * scale * (1 / cos( y*pi / 180 )) + temp <- which((x-x_dist) < cluster[, 1] & + cluster[, 1] < (x+x_dist) & + (y-y_dist) < cluster[, 2] & + cluster[, 2] < (y+y_dist)) + if(length(temp)>0){ + pdist <- pointDistance(c(x, y), matrix(c(cluster[temp, 1],cluster[temp, 2]), + ncol=2, byrow=FALSE), longlat=TRUE) + if(min(pdist)< dist){ + index <- which(pdist[]==min(pdist)) + cluster_id <- cluster[temp[index], 3] + points$cluster_id[j] <- cluster_id + } + } + } + return(points) +} + +######################################################### +# buffer +######################################################### + +osc.buffer <- function(input, width) +{ + if(class(input)=="RasterLayer"){ + m <- matrix(input[], nrow=input@nrows, byrow=TRUE) + returnraster <- T}else{ + m <- input + returnraster <- F + } + m1 <- .C("ccaBuffED", m=as.integer(m), nr=as.integer(dim(m)[1]), nc=as.integer(dim(m)[2]), sz=as.integer(width)) + m1 <- m1$m +# m1[which(m1<0)] <- -1 + m1 <- matrix(m1, nrow=dim(m)[1]) + if(returnraster){ + input[] <- raster(m1)[] + return(input) + }else{ + return(m1) + } +} diff --git a/R/getPart.R b/R/getPart.R new file mode 100644 index 0000000..81dc8eb --- /dev/null +++ b/R/getPart.R @@ -0,0 +1,38 @@ +cca.row <- function(pop, row){ + stopifnot(is.numeric(pop)) + stopifnot(is.matrix(pop)) + stopifnot(is.numeric(row)) + stopifnot(row<=NROW(pop)) + stopifnot(row>=0) + the.pop <- as.integer(t(pop)) + clu <- rep(999, ncol(pop)) + out <- .C("getrow", col=as.integer(row-1), xmax=nrow(pop), ymax=ncol(pop), pop=as.integer(the.pop), ret=as.integer(clu)) + + return(out$ret) +} +cca.col <- function(pop, col){ + stopifnot(is.numeric(pop)) + stopifnot(is.matrix(pop)) + stopifnot(is.numeric(col)) + stopifnot(col<=NCOL(pop)) + stopifnot(col>=1) + the.pop <- as.integer(t(pop)) + clu <- rep(999, nrow(pop)) + out <- .C("getcol", col=as.integer(col-1), xmax=nrow(pop), ymax=ncol(pop), pop=as.integer(the.pop), ret=as.integer(clu)) + + return(out$ret) +} +#cca.block <- function(pop, row,col, drow,dcol){ +# stopifnot(is.numeric(pop)) +# stopifnot(is.matrix(pop)) +# stopifnot(is.numeric(row)) +# stopifnot(row+drow<=NROW(pop)) +# stopifnot(col+dcol<=NROW(pop)) +# stopifnot(row>=1) +# stopifnot(col>=1) +# the.pop <- as.integer(t(pop)) +# clu <- rep(999, dcol*drow) +# out <- .C("getblock", row=as.integer(row-1), col=as.integer(col-1),drow=as.integer(drow),dcol=as.integer(dcol),xmax=nrow(pop), ymax=ncol(pop), pop=the.pop, ret=as.integer(clu)) +# +# return(matrix(out$ret, ncol=dcol)) +#} diff --git a/build/vignette.rds b/build/vignette.rds new file mode 100644 index 0000000..2934402 Binary files /dev/null and b/build/vignette.rds differ diff --git a/data/datalist b/data/datalist new file mode 100644 index 0000000..b874fa2 --- /dev/null +++ b/data/datalist @@ -0,0 +1,3 @@ +exampledata +landcover +population diff --git a/data/exampledata.rda b/data/exampledata.rda new file mode 100644 index 0000000..ad27f9a Binary files /dev/null and b/data/exampledata.rda differ diff --git a/data/landcover.rda b/data/landcover.rda new file mode 100644 index 0000000..7d91905 Binary files /dev/null and b/data/landcover.rda differ diff --git a/data/population.rda b/data/population.rda new file mode 100644 index 0000000..833c479 Binary files /dev/null and b/data/population.rda differ diff --git a/inst/doc/paper.R b/inst/doc/paper.R new file mode 100644 index 0000000..75458cf --- /dev/null +++ b/inst/doc/paper.R @@ -0,0 +1,161 @@ +### R code from vignette source 'paper.rnw' +### Encoding: UTF-8 + +################################################### +### code chunk number 1: paper.rnw:39-44 +################################################### +library(osc) +data(exampledata) +str(exampledata) +pop.list <- cca(exampledata[,1:2],s=1) +str(pop.list) + + +################################################### +### code chunk number 2: paper.rnw:49-56 +################################################### + palette(rainbow(12)) + pdf(file="pics/exdat1.pdf", paper="special", width=8, height=4) + par(mfrow=c(1,2)) + plot(exampledata$x,exampledata$y,col="darkblue", pch=15, xlab="", ylab="", cex=1.2) + plot(pop.list$cluster$long,pop.list$cluster$lat,col=pop.list$cluster$cluster_id, pch=15, xlab="", ylab="", cex=1.2) + dev.off() + palette("default") + + +################################################### +### code chunk number 3: paper.rnw:69-76 +################################################### +#initiate empty matrix +exampledata.pop <- matrix(0, nrow=max(exampledata$x), ncol=max(exampledata$y)) + +#restructure data +for(i in 1:NROW(exampledata)){ + exampledata.pop[exampledata$x[i],exampledata$y[i]] <- exampledata$z[i] +} + + +################################################### +### code chunk number 4: paper.rnw:81-83 +################################################### +example.result <- cca(exampledata.pop, s=1) +str(example.result) + + +################################################### +### code chunk number 5: paper.rnw:88-93 +################################################### + pdf(file="pics/exdat2.pdf", paper="special", width=8, height=4) + par(mfrow=c(1,2)) + image(x=1:20, y=1:20,exampledata.pop, xlab="", ylab="") + image(x=1:20, y=1:20,example.result, col=c("white",rep(rainbow(12),2)), xlab="", ylab="") + dev.off() + + +################################################### +### code chunk number 6: paper.rnw:112-118 +################################################### +# create a raster and set the projection information +raster <- raster(extent(0,5,0,5),nrow=5,ncol=5) +raster[c(1,2,3,5,6,10,17,18,22,23,24)] <- 1 +proj4string(raster) <- CRS("+proj=longlat") +# get a feeling for the dimensions +summary(distance(raster)[]) + + +################################################### +### code chunk number 7: paper.rnw:121-126 +################################################### +# cluster all cells with value 1 +# for various cluster distances of 150, 240 and 111 km +cluster <- cca(raster, cell.class=1, s=1.5e+05, unit="m") +cluster2 <- cca(raster, cell.class=1, s=2.4e+05, unit="m") +cluster3 <- cca(raster, cell.class=1, s=1.11e+05, unit="m") + + +################################################### +### code chunk number 8: paper.rnw:133-134 +################################################### +pixel <- cca(raster, cell.class=1, s=1) + + +################################################### +### code chunk number 9: paper.rnw:136-137 +################################################### +str(pixel) + + +################################################### +### code chunk number 10: paper.rnw:142-151 +################################################### +pdf("pics/raster.pdf", width=9, height=3) +par(mfrow=c(1,3)) +raster[cellFromXY(raster,cluster$cluster[,1:2])] <- cluster$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 150 km") +raster[cellFromXY(raster,cluster2$cluster[,1:2])] <- cluster2$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 240 km") +raster[cellFromXY(raster,cluster3$cluster[,1:2])] <- cluster3$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 111 km") +dev.off() + + +################################################### +### code chunk number 11: paper.rnw:164-166 +################################################### +data("landcover") +cities <- cca(landcover, cell.class=1, s=2000, unit="m") + + +################################################### +### code chunk number 12: paper.rnw:171-174 +################################################### +str(cities) +result <- landcover*NA +result[cellFromXY(result, cities$cluster[,1:2])] <- cities$cluster[,3] + + +################################################### +### code chunk number 13: paper.rnw:179-185 +################################################### +pdf("pics/landcover.pdf", width=8, height=6) +par(mfrow=c(1,2)) +cols <- c("lightblue", "darkred","yellow","orange","green","darkgreen") +plot(landcover, col=cols) +plot(result, col=rainbow(9)) +dev.off() + + +################################################### +### code chunk number 14: paper.rnw:205-234 (eval = FALSE) +################################################### +## library(osc) +## data("population") +## +## tl <- rep(NA,100) +## tm <- rep(NA,100) +## +## for(i in 1:100){ +## print(i) +## tm[i] <- system.time(mat <- cca(population, s=i, mode=3))[[1]] +## } +## +## index <- which(population[]>0) +## long <- ceiling(index/nrow(population)) +## lat <- index - ((long-1)*nrow(population)) +## popl <- data.frame(long, lat, cluster_id=rep(0, length(lat))) +## +## for(i in 1:100){ +## print(i) +## tl[i] <- system.time(list <- cca(popl,s=i) )[[1]] +## } +## +## saveRDS(tm, "tm.rds") +## saveRDS(tm, "tl.rds") +## +## pdf("compare.pdf", width=5, height=4) +## plot(tm, xlab="cluster distance", ylab="time in s", col="darkblue", log="", type="l") +## lines(tl, col="darkred") +## legend("topleft", legend=c("matrix","list"), col=c("darkblue","darkred"), pch="-") +## dev.off() + + diff --git a/inst/doc/paper.pdf b/inst/doc/paper.pdf new file mode 100644 index 0000000..d05fb5c Binary files /dev/null and b/inst/doc/paper.pdf differ diff --git a/inst/doc/paper.rnw b/inst/doc/paper.rnw new file mode 100644 index 0000000..42de8df --- /dev/null +++ b/inst/doc/paper.rnw @@ -0,0 +1,254 @@ +\documentclass[10pt,a4paper]{article} + +% Umlaute und neue deutsche Rechtschreibung +\usepackage[english]{babel} +% Kodierung (meistens utf8: OS X, Linux; latin1: Windows) +\usepackage[utf8]{inputenc} +% mathematische Symbole und Umgebungen +\usepackage{amsmath,amsfonts,amssymb} +\usepackage{graphicx} +% Sweave.sty angeben, gegebenfalls mit Pfad +\usepackage{Sweave} +\SweaveOpts{prefix.string=pics/fig} +% \VignetteIndexEntry{Using the City Clustering Algorithm} +% \VignetteKeywords{R, raster} +% \VignettePackage{cca} + + +\begin{document} +\SweaveOpts{concordance=TRUE} + +\title{Introduction to the City Clustering Algorithm} +\author{Steffen Kriewald} +\date{\today} +\maketitle +\tableofcontents + +\section{Introduction} + +This vignette describes first steps with the R package of the City Clustering Algorithm (CCA). CCA allows to cluster a specific value in a 2-dimensional data-set. This algorithm was originally used to identify cities based on clustered population- or land-cover-data, but can be applied in multiple cases. It was also used to identify hydrological connected areas based on digital elevation models. + +The clustering algorithm based on the burning algorithm presented in Rozenfeld et al.[1] and is implemented in two versions: a matrix based and a list based. The differences in run time and memory use will be discussed in section \ref{sec:compare}. The list-based algorithm can handle geo-referenced data and offers full integration of raster objects. + +For the problem of cities the method in general can be described as following: The CCA selects an urban cell and burns it in a first step. "burns" means the urban cell will be marked and then belongs to a specific cluster. Then the cca starts an iterative burning of all neighboring cells until all neighboring cells are non-urban cells. Another possibility is to allow small gaps between cells, by introducing a cluster distance. These small gaps are important as cities can be divided by rivers or through data processing. + +\section{First clustering} +\label{sec:first} + +We will start with a small example, using the exampledata. The data frame contains two columns representing the rows and columns, namely x and y. We will cluster the given points with a cluster distance s = 1. Here a cluster distance of s = 1 is equal to the pixel width. +<<>>= +library(osc) +data(exampledata) +str(exampledata) +pop.list <- cca(exampledata[,1:2],s=1) +str(pop.list) +@ + +In this example the cluster distance s=1 is equal to rook’s case and consequently only four neighboring cells will be joined to the cluster and no diagonal neighbors, see figure \ref{fig:exdat1}. The result, pop.list, is a list with two entries. The first contains a data frame with the original coordinates followed by a column representing the cluster by an identification number (id). The second is a a vector giving the size of the cluster. First number is the size of the cluster with cluster id = 1, second the size of cluster with cluster id = 2, and so on. + +<>= + palette(rainbow(12)) + pdf(file="pics/exdat1.pdf", paper="special", width=8, height=4) + par(mfrow=c(1,2)) + plot(exampledata$x,exampledata$y,col="darkblue", pch=15, xlab="", ylab="", cex=1.2) + plot(pop.list$cluster$long,pop.list$cluster$lat,col=pop.list$cluster$cluster_id, pch=15, xlab="", ylab="", cex=1.2) + dev.off() + palette("default") +@ + +\begin{figure} +\centering +\includegraphics{pics/exdat1.pdf} +\caption{Clustering based on a data frame with a cluster distance equal to the pixel width. Only the four directly connected neighboring cells are joined to a cluster and no diagonal neighbors. The left figure shows the original data and the right figure the clustered result, where each color represent a separated cluster.} +\label{fig:exdat1} +\end{figure} + + +If we want to use the matrix based version we have first to convert the data into a matrix. + +<<>>= +#initiate empty matrix +exampledata.pop <- matrix(0, nrow=max(exampledata$x), ncol=max(exampledata$y)) + +#restructure data +for(i in 1:NROW(exampledata)){ + exampledata.pop[exampledata$x[i],exampledata$y[i]] <- exampledata$z[i] +} +@ + +Afterwards we can call the cca again for the cluster distance s = 1. + +<<>>= +example.result <- cca(exampledata.pop, s=1) +str(example.result) +@ + +The result, example.result, is a matrix that defines for each cell to which cluster it belongs (illustrated by figure \ref{fig:exdat2}). In this example we used the default value 3 for mode, but mode = 1 will have the same effect. However, if mode = 2 all eight neighboring cells would be considered, which would lead to one single cluster. + +<>= + pdf(file="pics/exdat2.pdf", paper="special", width=8, height=4) + par(mfrow=c(1,2)) + image(x=1:20, y=1:20,exampledata.pop, xlab="", ylab="") + image(x=1:20, y=1:20,example.result, col=c("white",rep(rainbow(12),2)), xlab="", ylab="") + dev.off() +@ + + +\begin{figure} +\centering +\includegraphics{pics/exdat2.pdf} +\caption{Clustering based on a matrix with a cluster distance equal to the pixel width. The result is exactly the same as the previous result shown in figure \ref{fig:exdat1}.} +\label{fig:exdat2} +\end{figure} + + + +\section{Clustering of geo-referenced data} + +The CCA supports native the raster format, but will do the geo-referenced clustering based on points. To do so the cca converts automatically the raster to a data-frame with two columns, namely long and lat coordinates of the center points from the chosen raster cells. For the use of the orthodromic distance, option unit="m", a WGS84 Latitude/Longitude projection of the data is mandatory. The algorithm compares the distances from every cell to each other. If the distance is smaller than the threshold of the cluster-distance the cells will be considered as one cluster. + +Let's do a short example: First we will create a raster and then cluster the cells with value one for three different cluster distances 150, 240 and 111 km. + +<<>>= +# create a raster and set the projection information +raster <- raster(extent(0,5,0,5),nrow=5,ncol=5) +raster[c(1,2,3,5,6,10,17,18,22,23,24)] <- 1 +proj4string(raster) <- CRS("+proj=longlat") +# get a feeling for the dimensions +summary(distance(raster)[]) +@ + +<>= +# cluster all cells with value 1 +# for various cluster distances of 150, 240 and 111 km +cluster <- cca(raster, cell.class=1, s=1.5e+05, unit="m") +cluster2 <- cca(raster, cell.class=1, s=2.4e+05, unit="m") +cluster3 <- cca(raster, cell.class=1, s=1.11e+05, unit="m") +@ + + +Notice the different results, to be more precise the different number of clusters, for the different cluster distances. For the first run with a cluster distance of 150 km where three clusters identified, see figure \ref{fig:raster} left. The distance is large enough to identify all neighbors as part of the cluster. Through an increase to 240 km the upper left cluster and the upper right become one cluster, see figure \ref{fig:raster} middle. The distance is large enough to bridge a horizontal gap of one empty cell, but not to bridge a diagonal gap. This can be reached by a further increase up to 250 km. +A cluster distance smaller than 110 km will result into 11 clusters, every single cell. A special case can be created by choosing a cluster distance of 111 km, see figure \ref{fig:raster} right. In this case the three upper left cells forming one cluster, where as all other cells are separated. This is because of the cell width, which is lower in high latitudes. +It is also possible to cluster a raster pixelwise, without consideration of the projection. +<>= +pixel <- cca(raster, cell.class=1, s=1) +@ +<<>>= +str(pixel) +@ +For a clusterdistance equal to the resolution of the raster this will lead to the same result as the in section \ref{sec:first} presented ways or the clump-function from the raster package [2]. The second entry of pixel, the cluster size, is then simple the number of cells for each cluster. + + +<>= +pdf("pics/raster.pdf", width=9, height=3) +par(mfrow=c(1,3)) +raster[cellFromXY(raster,cluster$cluster[,1:2])] <- cluster$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 150 km") +raster[cellFromXY(raster,cluster2$cluster[,1:2])] <- cluster2$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 240 km") +raster[cellFromXY(raster,cluster3$cluster[,1:2])] <- cluster3$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 111 km") +dev.off() +@ + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{pics/raster.pdf} +\caption{Clustering of a geo-referenced raster object for several cluster distances. Each colour identifies a cluster. The number of clusters increases with a decrease of the cluster distance.} +\label{fig:raster} +\end{figure} + +\subsection{City Clustering} +Another more realistic example for a fictional land-cover data with five different classes, see figure \ref{fig:landcover}. We chose the cell class 1, a cluster distance of 2 km and the unit="m". + +<>= +data("landcover") +cities <- cca(landcover, cell.class=1, s=2000, unit="m") +@ + +The result, cities, is again a list. However, you can create a raster-object from the results based on the original raster within two lines. + +<>= +str(cities) +result <- landcover*NA +result[cellFromXY(result, cities$cluster[,1:2])] <- cities$cluster[,3] +@ + +We took the coordinates of the clustered cells to compute the cellnumber and replaced these cells with the corosponding cluster id. The result is shown in figure \ref{fig:landcover}. + +<>= +pdf("pics/landcover.pdf", width=8, height=6) +par(mfrow=c(1,2)) +cols <- c("lightblue", "darkred","yellow","orange","green","darkgreen") +plot(landcover, col=cols) +plot(result, col=rainbow(9)) +dev.off() +@ + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{pics/landcover.pdf} +\caption{Clustering of land cover data. Left side the original data and on the right side the clutered result.} +\label{fig:landcover} +\end{figure} + + +\section{Comparison of the two Versions - matrix vs. list} +\label{sec:compare} + +As we have shown already in section \ref{sec:first} the two implemented versions do exactly the same just in different ways. If your data is already in one of the known types (matrix, data.frame, raster) it is logical to use the corresponding implementation of the algorithm. However, there are huge differences in run time and memory consumption which could give you a hint for the right choice. + +The memory use of the matrix-based version depends only on the size of the matrix / raster. As for the algorithm two matrices are needed, consequently the memory use will be doubled. For the list-based version the memory needed depends on the number of cells which are occupied. Each of these cells will be saved in a data-frame with xy-coordinates and a placeholder for the cluster id. + +To summarize it: The list-version is faster for sparse matrices and large cluster-distances, where as the matrix version is better for dense matrices and small cluster-distances. In general the list version needs less memory. + +<>= +library(osc) +data("population") + +tl <- rep(NA,100) +tm <- rep(NA,100) + +for(i in 1:100){ + print(i) + tm[i] <- system.time(mat <- cca(population, s=i, mode=3))[[1]] +} + +index <- which(population[]>0) +long <- ceiling(index/nrow(population)) +lat <- index - ((long-1)*nrow(population)) +popl <- data.frame(long, lat, cluster_id=rep(0, length(lat))) + +for(i in 1:100){ + print(i) + tl[i] <- system.time(list <- cca(popl,s=i) )[[1]] +} + +saveRDS(tm, "tm.rds") +saveRDS(tm, "tl.rds") + +pdf("compare.pdf", width=5, height=4) +plot(tm, xlab="cluster distance", ylab="time in s", col="darkblue", log="", type="l") +lines(tl, col="darkred") +legend("topleft", legend=c("matrix","list"), col=c("darkblue","darkred"), pch="-") +dev.off() +@ + + +\begin{figure} +\centering +\includegraphics{compare.pdf} +\caption{Comparison of the run time for matrix and list based implementation of the cca.} +\label{fig:compare} +\end{figure} + +\newpage +\begin{thebibliography}{9} +\bibitem{Rozenfeld08} +Rozenfeld, H. D., Rybski, D., Andrade, J. S., Batty, M., Stanley, H. E., \& Makse, H. a. (2008). Laws of population growth. Proceedings of the National Academy of Sciences of the United States of America, 105(48), 18702–7. doi:10.1073/pnas.0807435105 +\bibitem{raster14} +Robert J. Hijmans (2014). raster: raster: Geographic data analysis and modeling. R package version 2.2-31. + http://CRAN.R-project.org/package=raster +\end{thebibliography} + +\end{document} diff --git a/man/assign.data.Rd b/man/assign.data.Rd new file mode 100644 index 0000000..5437710 --- /dev/null +++ b/man/assign.data.Rd @@ -0,0 +1,53 @@ +\name{assign.data} +\alias{assign.data} +\title{ +Assign data to clusters +} +\description{ +After clustering assign additional data from a data frame with columns indicated as latitude and longitude. +} +\usage{ +assign.data(cluster, points, dist=1000) +} +\arguments{ + \item{cluster}{ + The from cca() generated data frame with cluster-information. +} + \item{points}{ + Data frame with additional data, containing at least a "lat" and "long" column with point coordinates which will be assigned. +} + \item{dist}{ + The assignment distance given in meters. Are the given point coordinates within this distance from an identified cluster, this point will be assigned to the cluster. +} +} +\details{ +Multiple points can be assigned to the same cluster. If no cluster is within the given distance, the cluster_id will be 0. +} +\value{ +Returns the data frame given as points with an additional column "cluster_id" referring to the identified cluster. A cluster_id 0 indicates that no cluster was within the given distance. +} + +\examples{ +data(landcover) + +# clustering urban areas +urban <- cca(landcover, cell.class=1,s=2000, unit="m") +str(urban) + +# plot the result +result <- landcover*NA +result[cellFromXY(result,urban$cluster[,c("long","lat")])]<-urban$cluster[,"cluster_id"]*(-1) +plot(result, col=rainbow(9)) + +# data.frame with additional information (name) +data.points <- data.frame( + long=c(13.26,13.28), + lat=c(52.34,52.20), + name=c("Pappelhausen","New Garden") + ) + +points(data.points$long, data.points$lat, pch="X") + +assign.data(cluster=urban$cluster, points=data.points, dist=3000) +} +\keyword{utils} diff --git a/man/cca.Rd b/man/cca.Rd new file mode 100644 index 0000000..bf0afef --- /dev/null +++ b/man/cca.Rd @@ -0,0 +1,140 @@ +\name{cca} +\alias{cca} +\alias{cca.single} +\title{ +City Clustering Algorithm (CCA) +} +\description{ +The City Clustering Algorithm (CCA) is based on the burning algorithm [1] and +was introduced in the context of cities in [2]: +CCA is initialized by selecting an arbitrary populated cell which is burnt. +Then, the populated neighbors are also burnt. The algorithm keeps growing +the cluster by iteratively burning neighbors of the burnt cells until there +are no further populated neighboring cells. Next, another +unburned populated cell is picked and the procedure is repeated until all +populated cells are assigned to a cluster. +} +\usage{ +cca(data, s=1, mode=3, count.cells=FALSE, + count.max=ncol(data)*3, + res.x=NULL, res.y=NULL, cell.class=1, + unit="", compare="") +cca.single(data, s, x,y, mode = 3) +} +\arguments{ + \item{data}{ + data to be clustered. This can be a raster, a matrix or a data.frame. See details. +} + \item{s}{ + The radius/shell size of the burning procedure +(i.e. how tolerant to small gaps the algorithm is). + In number of cells if data is a numeric matrix. + In meters if data is a raster or data.frame. +} + \item{x}{ + The starting position in x direction. +} + \item{y}{ + The starting position in y direction +} + \item{mode}{ +The algorithm for a non-georeferenced matrix comes in three versions which affect which close cells are +included to the considered cluster: +(mode=1) nearest neighbors +(mode=2) cells within a shell (i.e. squares of certain size) +(mode=3) cells within a radius +Whereas (mode=1) is equivalent to (mode=3) with r=1 and +(mode=2) with r=1 is equivalent to (mode=3) with r=2. + Only used if data is a numeric matrix. +} + \item{count.cells}{ + Set this option TRUE, if you want know the number of cells which belongs to each cluster. + Only used if data is a numeric matrix. +} + \item{count.max}{ + This defines the maximum number of clusters, It is set per default to ncol*3. + Only used if data is a numeric matrix. +} + \item{res.y}{ + The resolution of the data-set, expressed as distance between to cell centers in degrees (geographical coordinate system). + Not needed if data is a numeric matrix. +} + \item{res.x}{ + As res.x. Only needed if data is a data.frame. +} + \item{cell.class}{ + Only required if data is a raster. Specify which cell class (eg. land use type) will be clustered. Can be an integer or a vector and can be combined with the compare option. +} + \item{unit}{ + If unit = "m" (meter) the cluster algorithm will be done for a cluster distance in meter. Also the cluster size will be in square kilometers. Otherwise the clustering is done in the degrees. If you want to do a pixel-wise clustering, then choose the resolution as cluster distance. +} + \item{compare}{ + If compare = "g" then cells greater than the given cell.class will be chosen. If compare = "s" cells smaller then the cell.class will be chosen. +} + +} +\details{ +cca is implemented in two versions, depending on the format of the data. For numerical matrices, a matrix based version is called. For raster and data.frame based data, a list based version is used, which is faster for sparse matrices and large cluster distances. (See vignette, section: Comparison - matrix vs list) + +For matrix: + +The matrix is a simple numerical matrix. A value equal 0 or smaller is treated as unimportant cell, a value above 0 is treated as cell of interest. Clusters of connected cells are identified. + +For raster: + +A sub-function will be called to extract the coordinates of a given cell type (cell.class). Also a threshold determining which cells can be burnt is possible by using compare = "g" (eg. minimum population to consider a cell as populated) Following steps; see data.frame. + + +For data.frame: + +A data frame with two columns specifying the longitude and latitude coordinate. The algorithm identifies all points with a distance to each other smaller than the cluster distance s. If unit="m" the orthodromic distance, otherwise the euclidean distance will be used. + + +} +\value{ +For matrix: + +Matrix that defines for each cell to which cluster it belongs. + +For raster / data.frame: + +List with two entries - 1. data frame with longitude and latitude coordinates of the cells and the cluster_id. and 2. a vector giving the size of the cluster. First number is the size of the cluster with cluster_id 1, second the size of cluster with cluster_id 2, and so on. + +} +\references{ +[1] Stauffer D (1984) Introduction to Percolation Theory (Taylor & Francis, London). + +[2] Rozenfeld HD, et al. (2008) Laws of population growth. Proc Nat Acad Sci USA 105:18702-18707. +} +\author{ +Steffen Kriewald, Till Fluschnik, Dominik Reusser, Diego Rybski +} + + +\examples{ + +# for a matrix +data(population) +image(population) + +clusters <- cca(population, s=5) +cols <- c("white",rep(rainbow(10), length.out=length(table(clusters))) ) +image(clusters, col=cols, xlab="", ylab="") + +one.cluster <- cca.single(population, s=1, x=125, y=125) +image(one.cluster, col=cols, xlab="", ylab="") + +# for a raster-object +data(landcover) + +# clustering urban areas +urban <- cca(landcover, cell.class=1,s=2000, unit="m") +str(urban) + +# plot the result +result <- landcover*NA +result[cellFromXY(result,urban$cluster[,c("long","lat")])]<-urban$cluster[,"cluster_id"]*(-1) +plot(result, col=rainbow(9)) + +} +\keyword{ utils } diff --git a/man/coordinate.list.Rd b/man/coordinate.list.Rd new file mode 100644 index 0000000..8370f32 --- /dev/null +++ b/man/coordinate.list.Rd @@ -0,0 +1,37 @@ +\name{coordinate.list} +\alias{coordinate.list} +\title{ +List of coordinates for clustering +} +\description{ +Extracts coordinates of cells with defined cell class from a raster object. +} +\usage{ +coordinate.list(raster, cell.class, compare = "") +} +\arguments{ + \item{raster}{ + raster with values +} + \item{cell.class}{ + number or vector of cell-values for clustering +} + \item{compare}{ + character of type "", "g" or "s". If "g" or "s" all coordinates of cells with value greater "g" resp. smaller "s" than cell.class will be extracted +} +} + +\details{ + Works also for very large raster, but can take some time. +} +\value{ + Returns a data frame with lat-, long-coordinates +} + +\examples{ +data("landcover") + +coordinate.list(landcover, 1:10) + +} +\keyword{utils} diff --git a/man/exampledata.Rd b/man/exampledata.Rd new file mode 100644 index 0000000..c958003 --- /dev/null +++ b/man/exampledata.Rd @@ -0,0 +1,24 @@ +\name{exampledata} +\alias{exampledata} +\docType{data} +\title{ +Example data for the clustering algorithm. +} +\description{ +This is test data for the package. +} +\usage{data(exampledata)} +\format{ + A data frame with 235 observations on the following 4 variables. + \describe{ + \item{\code{x}}{a numeric vector of x-coordinates} + \item{\code{y}}{a numeric vector of y-coordinates} + \item{\code{z}}{a numeric vector of population data} + \item{\code{cluster}}{a numeric vector of clusters} + } +} +\examples{ +data(exampledata) +str(exampledata) +} +\keyword{datasets} diff --git a/man/landcover.Rd b/man/landcover.Rd new file mode 100644 index 0000000..8362e43 --- /dev/null +++ b/man/landcover.Rd @@ -0,0 +1,19 @@ +\name{landcover} +\alias{landcover} +\docType{data} +\title{ +Fictional landcover data to demonstrate the cca for Raster-Data +} +\description{ +A fictional landcover dataset with six different landtypes indicated by number from 0 to 5. Cells with value one a considered as urban in the examples. +} +\usage{data(landcover)} +\format{ + The format is: +Formal class 'RasterLayer' [package "raster"] with 12 slots +} +\examples{ +data(landcover) +str(landcover) +} +\keyword{datasets} diff --git a/man/osc.buffer.Rd b/man/osc.buffer.Rd new file mode 100644 index 0000000..8128169 --- /dev/null +++ b/man/osc.buffer.Rd @@ -0,0 +1,30 @@ +\name{osc.buffer} +\alias{osc.buffer} +\title{ +Simple Buffer algorithm +} +\description{ +Simple buffer based on euclidean distance are created around all cells equal to one. +} +\usage{ +osc.buffer(input, width) +} +\arguments{ + \item{input}{ + Matrix or Raster containing 1 indicating a cluster, no NA values are allowed +} + \item{width}{ + Width of the buffer in cells +} +} + +\value{ + Returns matrix or raster, depending on input, with 1 for cluster and -1 for surrounding buffer +} + +\examples{ +data(landcover) +landcover[landcover[]>1] <- 0 +plot(osc.buffer(landcover, 4)) +} +\keyword{utils} diff --git a/man/population.Rd b/man/population.Rd new file mode 100644 index 0000000..71e49ab --- /dev/null +++ b/man/population.Rd @@ -0,0 +1,19 @@ +\name{population} +\alias{population} +\docType{data} +\title{ +Example population data for the city clustering algorithm +} +\description{ +Example population data for the city clustering algorithm +} +\usage{data(population)} +\format{ + The format is: + num [1:1525, 1:1000] 0 0 0 0 0 0 0 0 0 0 ... +} +\examples{ +data(population) +str(population) +} +\keyword{datasets} diff --git a/src/cca-core.c b/src/cca-core.c new file mode 100644 index 0000000..0f9224e --- /dev/null +++ b/src/cca-core.c @@ -0,0 +1,291 @@ + +#include +#include +#include + +//Required +void getrow(int *x, int *xmax, int *ymax, int *data, int *ret) { + + int y; + for(y=0; y<= *ymax; y++){ + ret[y] = data[*x * *ymax + y]; + } +} +void getcol(int *y, int *xmax, int *ymax, int *data, int *ret) { + int x; + for(x=0; x<= *xmax; x++){ + ret[x] = data[x * *ymax + *y]; + } +} +void getblock(int *y, int *x, int *dy, int *dx, int *xmax, int *ymax, int *data, int *ret) { + int e, f; + for(e=0; e< *dx; e++){ + for(f=0; f< *dy; f++){ + ret[f * *dy + e] = data[f * *ymax + e]; + } + } +} + + +void burnn(int *x,int *y,int *c, int *xmax, int *ymax, int *data,int *clu) { + // burn nearest neigbors + //Rprintf("xycrxmaxymax\t%i\t%i\t%i\t%i\t%i\t%i\n",*x,*y,*c,1,*xmax,*ymax); + int d, e,f,g; + d=*x; // steps to the left + while(d>=0 && data[d * *ymax + *y]>0) { + clu[d * *ymax + *y]=*c; + d--; + + } + e=*x+1; // steps to the right + while(e<*xmax&&data[e * *ymax + *y]>0) { + clu[e * *ymax + *y]=*c; + e++; + } + //Rprintf(" de data\t%i\t%i\t", d,e); + for(f=*y+1;f>=*y-1;f=f-2){// one step up or down + if(f<*ymax && f>=0){ + for(g=d+1; g0) { + burnn(&g,&f,c,xmax,ymax,data,clu); + } + } + } + } +} + +void burns(int *data,int *clu,int *x,int *y,int *c,int *s, int *xmax, int *ymax) { + // burn s shells + //Rprintf("xycrxmaxymax\t%i\t%i\t%i\t%i\t%i\t%i\n",*x,*y,*c,*s,*xmax,*ymax); + long a,b; + int dx,dy; + int d, e,g; + d=*x; // steps to the left + while(d>=0 && clu[d* *ymax + *y]==0 && data[d* *ymax + *y]>0) { + clu[d * *ymax + *y]=*c; + d--; + + } + e=*x+1; // steps to the right + while(e<*xmax&&clu[e * *ymax + *y]==0&&data[e * *ymax + *y]>0) { + clu[e * *ymax + *y]=*c; + e++; + } + for(g=d+1; g=0&&dx<*xmax) { + for(b=-*s;b<= *s;b++) { + dy=*y+b; + //Rprintf("dy\t%i\n",dy); + if(dy>=0&&dy<*ymax) { + //Rprintf("clu[%i,%i], data[%i,%i]\t%i\t%i\n",dx, dy, dx,dy,clu[dx * *ymax +dy],data[dx * *ymax +dy]); + if(clu[dx * *ymax +dy]==0&&data[dx * *ymax +dy]>0) { + burns(data,clu,&dx,&dy,c,s,xmax,ymax); + } + } + } + } + } + } +} + +void burnr(int *data,int *clu,int *x,int *y,int *c,int *r, int *xmax, int *ymax) { + // burn with radius r + // r=1 should correspond burnn + long a,b; + int dx,dy; + double rd; + int d, e,g; + d=*x; // steps to the left + while(d>=0 && clu[d* *ymax + *y]==0 && data[d* *ymax + *y]>0) { + clu[d * *ymax + *y]=*c; + d--; + + } + e=*x+1; // steps to the right + while(e<*xmax&&clu[e * *ymax + *y]==0&&data[e * *ymax + *y]>0) { + clu[e * *ymax + *y]=*c; + e++; + } + for(g=d+1; g=0&&dx<*xmax)&&(dy>=0&&dy<*ymax)) { + if(clu[dx * *ymax + dy]==0&& data[dx * *ymax + dy]>0) { + burnr(data,clu,&dx,&dy,c,r, xmax, ymax); + } + } + } + } + } + } +} + + /* Note: R fills matrices per column, so first index is row, + second is column */ +void callburn(int *s, int *xmax, int *ymax, int *mode, int *data,int *clu) { + int a,b; + int c; + c=0; + for(a=0;a<*xmax;a++) { + for(b=0;b<*ymax;b++) { + if(data[a* *ymax + b]>0&&clu[a* *ymax + b]==0) { + c++; + if(*mode == 1){ + burnn(&a,&b,&c,xmax,ymax,data,clu); // burn nearest + } else if(*mode == 2){ + burns(data,clu,&a,&b,&c,s,xmax,ymax); // burn shells + } else if(*mode == 3){ + burnr(data,clu,&a,&b,&c,s,xmax,ymax); // burn radius + } else { + Rprintf("unknown mode: %d\n", *mode); + } + } + } + } +} + + + + +void burnn_count(int *x,int *y,int *c, int *xmax, int *ymax, int *data,int *clu, int *count) { + // burn nearest neigbors + //Rprintf("xycrxmaxymax\t%i\t%i\t%i\t%i\t%i\t%i\n",*x,*y,*c,1,*xmax,*ymax); + int d, e,f,g; + d=*x; // steps to the left + while(d>=0 && data[d * *ymax + *y]>0) { + clu[d * *ymax + *y]=*c; + count[*c-1]++; + d--; + } + e=*x+1; // steps to the right + while(e<*xmax&&data[e * *ymax + *y]>0) { + clu[e * *ymax + *y]=*c; + count[*c-1]++; + e++; + } + //Rprintf(" de data\t%i\t%i\t", d,e); + for(f=*y+1;f>=*y-1;f=f-2){// one step up or down + if(f<*ymax && f>=0){ + for(g=d+1; g0) { + burnn_count(&g,&f,c,xmax,ymax,data,clu,count); + } + } + } + } +} + +void burns_count(int *data,int *clu,int *x,int *y,int *c,int *s, int *xmax, int *ymax, int *count) { + // burn s shells + //Rprintf("xycrxmaxymax\t%i\t%i\t%i\t%i\t%i\t%i\n",*x,*y,*c,*s,*xmax,*ymax); + long a,b; + int dx,dy; + int d, e,g; + d=*x; // steps to the left + while(d>=0 && clu[d* *ymax + *y]==0 && data[d* *ymax + *y]>0) { + clu[d * *ymax + *y]=*c; + count[*c-1]++; + d--; + + } + e=*x+1; // steps to the right + while(e<*xmax&&clu[e * *ymax + *y]==0&&data[e * *ymax + *y]>0) { + clu[e * *ymax + *y]=*c; + count[*c-1]++; + e++; + } + for(g=d+1; g=0&&dx<*xmax) { + for(b=-*s;b<= *s;b++) { + dy=*y+b; + //Rprintf("dy\t%i\n",dy); + if(dy>=0&&dy<*ymax) { + //Rprintf("clu[%i,%i], data[%i,%i]\t%i\t%i\n",dx, dy, dx,dy,clu[dx * *ymax +dy],data[dx * *ymax +dy]); + if(clu[dx * *ymax +dy]==0&&data[dx * *ymax +dy]>0) { + burns_count(data,clu,&dx,&dy,c,s,xmax,ymax,count); + } + } + } + } + } + } +} + +void burnr_count(int *data,int *clu,int *x,int *y,int *c,int *r, int *xmax, int *ymax, int *count) { + // burn with radius r + // r=1 should correspond burnn + long a,b; + int dx,dy; + double rd; + int d, e,g; + d=*x; // steps to the left + while(d>=0 && clu[d* *ymax + *y]==0 && data[d* *ymax + *y]>0) { + clu[d * *ymax + *y]=*c; + count[*c-1]++; + d--; + + } + e=*x+1; // steps to the right + while(e<*xmax&&clu[e * *ymax + *y]==0&&data[e * *ymax + *y]>0) { + clu[e * *ymax + *y]=*c; + count[*c-1]++; + e++; + } + for(g=d+1; g=0&&dx<*xmax)&&(dy>=0&&dy<*ymax)) { + if(clu[dx * *ymax + dy]==0&& data[dx * *ymax + dy]>0) { + burnr_count(data,clu,&dx,&dy,c,r, xmax, ymax, count); + } + } + } + } + } + } +} + + /* Note: R fills matrices per column, so first index is row, + second is column */ +void callburn_count(int *s, int *xmax, int *ymax, int *mode, int *data,int *clu, int *count, int *countmax) { + int a,b; + int c; + c=0; + for(a=0;a<*xmax;a++) { + for(b=0;b<*ymax;b++) { + if(data[a* *ymax + b]>0&&clu[a* *ymax + b]==0) { + c++; + if(*countmax <= c){ + Rprintf("count.max is too little\n"); + return; + } + if(*mode == 1){ + burnn_count(&a,&b,&c,xmax,ymax,data,clu,count); // burn nearest + } else if(*mode == 2){ + burns_count(data,clu,&a,&b,&c,s,xmax,ymax,count); // burn shells + } else if(*mode == 3){ + burnr_count(data,clu,&a,&b,&c,s,xmax,ymax,count); // burn radius + } else { + Rprintf("unknown mode: %d\n", *mode); + } + } + } + } +} + diff --git a/src/ccaRevolution.c b/src/ccaRevolution.c new file mode 100644 index 0000000..2a26ff6 --- /dev/null +++ b/src/ccaRevolution.c @@ -0,0 +1,194 @@ +#include +#include +#include +#include +#include +#include + +// m is (n x 3)-matrix. The first col includes x-coordinates, the second y-coordinates +// and the third one includes place -holder for the cluster-id +// n is the length of the list (the number of rows of the matrix, to be precise) +// l ist the cluster distance + +void ccaRev(double *m, int *n, double *l, int *w) +{ + w[0] = 0; + int zeros, i, k, j, mm, cc; + zeros = 0; + i=0; + mm=1; + cc=1; + double dist; + + while(zeros<*n){ + k = w[i]; + if(m[2* *n+ k]==0){ + m[2* *n + k] = cc; + zeros++; + } + if(k>0) + for(j=k-1; j>=0 && (m[k]-m[j])<=*l; j--){ + if(m[2* *n + j]==0 && abs(m[*n+k]-m[*n+j])<=*l){ + dist = sqrt(((m[k]-m[j])*(m[k]-m[j]))+((m[*n+k]-m[*n+j])*(m[*n+k]-m[*n+j]))); + if(dist<=(double)*l){ + w[mm] = j; + m[2* *n+j] = cc; + mm++; + zeros++; + } + + } + } + if(k<*n-1) + for(j=k+1; j<*n && (m[j]-m[k])<=*l; j++){ + if(m[2* *n + j]==0 && abs(m[*n+k]-m[*n+j])<=*l){ + dist = sqrt(((m[k]-m[j])*(m[k]-m[j]))+((m[*n+k]-m[*n+j])*(m[*n+k]-m[*n+j]))); + if(dist<=(double)*l){ + w[mm] = j; + m[2* *n+j] = cc; + mm++; + zeros++; + } + } + } + i++; + if(zeros==*n){ break;} + if(w[i]==0){ + cc++; + k = 0; + while(m[2* *n+ k]!=0) k++; + w[i] = k; + mm = i+1; + } + } +} + + + +void ccaSum(double *m, int *m3, double *mm, int *n){ + int i; + for(i=0; i<*n; i++) + mm[m3[i]-1] += 1; +} + +void ccaSumT(double *m, int *m3, double *mm, int *n){ + int i; + for(i=0; i<*n; i++) + mm[m3[i]-1] += cos(m[*n+i]); +} + +void ccaRevT(double *m, int *n, double *l, int *step_w, int *step_h, double *res_x, double *res_y, int *w) +{ + w[0] = 0; + int zeros, i, k, j, mm, cc; + zeros = 0; + i=0; + mm=1; + cc=1; + double dist; + + while(zeros<*n){ + k = w[i]; + if(m[2* *n+ k]==0){ + m[2* *n + k] = cc; + zeros++; + } + if(k>0) + for(j=k-1; j>=0 && abs((m[*n+k]-m[*n+j])/ *res_y)<=*step_h; j--){ + if(m[2* *n + j]==0 && abs((m[k]-m[j])/ *res_x)<=*step_w){ + dist = acos(sin(m[*n+k])*sin(m[*n+j])+cos(m[*n+k])*cos(m[*n+j])*cos(m[k]-m[j]))*6371000; + if(dist<=(double)*l){ + w[mm] = j; + m[2* *n+j] = cc; + mm++; + zeros++; + } + + } + } + if(k<*n-1) + for(j=k+1; j<*n && abs((m[*n+j]-m[*n+k])/ *res_y)<=*step_h; j++){ + if(m[2* *n + j]==0 && abs((m[k]-m[j])/ *res_x)<=*step_w){ + dist = acos(sin(m[*n+k])*sin(m[*n+j])+cos(m[*n+k])*cos(m[*n+j])*cos(m[k]-m[j]))*6371000; + if(dist<=(double)*l){ + w[mm] = j; + m[2* *n+j] = cc; + mm++; + zeros++; + } + } + } + + i++; + if(zeros==*n){ break;} + if(w[i]==0){ + cc++; + k = 0; + while(m[2* *n+ k]!=0) k++; + w[i] = k; + mm = i+1; + } + } +} + +double SCMakse(int *m, int *n, double *theta_0, double *xq, double *l) +{ + int k, j, delt=0; + double dist, zsum=0; + + for(k=0; k<*n; k++){ + + if(k>0) + for(j=k-1; j>=0 && (m[k]-m[j])<=*l; j--){ + if(m[2* *n + j]==0 && abs(m[*n+k]-m[*n+j])<=*l){ + dist = sqrt(((m[k]-m[j])*(m[k]-m[j]))+((m[*n+k]-m[*n+j])*(m[*n+k]-m[*n+j]))); + if(dist==(double)*l){ + delt++; + zsum += (m[3* *n+j]-*xq)*(m[3* *n+k]-*xq); + + } + + } + } + if(k<*n-1) + for(j=k+1; j<*n && (m[j]-m[k])<=*l; j++){ + if(m[2* *n + j]==0 && abs(m[*n+k]-m[*n+j])<=*l){ + dist = sqrt(((m[k]-m[j])*(m[k]-m[j]))+((m[*n+k]-m[*n+j])*(m[*n+k]-m[*n+j]))); + if(dist==(double)*l){ + delt++; + zsum += (m[3* *n+j]-*xq)*(m[3* *n+k]-*xq); + } + } + } + } + return (zsum/(delt* *theta_0)); +} + + + +int min(int a, int b) +{ + if(a<=b) return(a); + else return(b); +} + +int max(int a, int b) +{ + if(a>b) return(a); + else return(b); +} + +void ccaBuffED(int *m, int *nr, int *nc, int *sz) +{ + int i,j,k,l,s=*sz; + for(i=0; i<*nc; i++){ + for(j=0; j<*nr; j++){ + if(m[i**nr+j]==1) + for(k=max(0,i-s); k<=min(*nc-1,i+s); k++) + for(l=max(0,j-s); l<=min(*nr-1,j+s); l++) + if(sqrt(((k-i)*(k-i))+((l-j)*(l-j)))<=s && m[k**nr+l]==0) + m[k**nr+l] = -1; + } + } +} + diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..5abc76b --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,2 @@ +library(testthat) +test_check('osc') \ No newline at end of file diff --git a/tests/testthat/test_cca_multi.R b/tests/testthat/test_cca_multi.R new file mode 100644 index 0000000..e570988 --- /dev/null +++ b/tests/testthat/test_cca_multi.R @@ -0,0 +1,12 @@ +context("CCA Multi") + +test_that("cca works for population", { + data(population) + expect_equivalent(length(table(cca(population, s=1, mode=1))), 429) +}) + +test_that("cca works for population mode 3", { + data(population) + expect_equivalent(length(table(cca(population, s=10, mode=3))), 15) +}) + diff --git a/tests/testthat/test_cca_single.R b/tests/testthat/test_cca_single.R new file mode 100644 index 0000000..3df9ae6 --- /dev/null +++ b/tests/testthat/test_cca_single.R @@ -0,0 +1,34 @@ +context("CCA Single") +test_that("cca single works for full matrix", { + expect_that(cca.single(matrix(1:12,nrow=3), s=1, x=3,y=1,mode=1), equals(matrix(rep(1,12),nrow=3))) + expect_that(cca.single(matrix(1:12,nrow=3), s=1, x=3,y=1,mode=2), equals(matrix(rep(1,12),nrow=3))) + expect_that(cca.single(matrix(1:12,nrow=3), s=1, x=3,y=1,mode=3), equals(matrix(rep(1,12),nrow=3))) + }) +test_that("cca single works for half matrix", { + half.matrix <- matrix(rep(c(1,0),each=6),nrow=3) + expect_that(cca.single(half.matrix, s=1, x=1,y=3,mode=1), equals(matrix(rep(0,12),nrow=3))) + expect_that(cca.single(half.matrix, s=1, x=1,y=2,mode=1), equals(half.matrix)) + }) + +test_that("cca single works differently for nearest neighbors (mode=1), shell (mode=2) and radius (mode=3) ", { + complex.matrix <- matrix(rep(0,60),nrow=12) + complex.matrix[1,1] <- 1 + corner.only <- complex.matrix + complex.matrix[2,2] <- 1 #not reached by nearest neighbor, but by shell of size 1 + two.areas <- complex.matrix + complex.matrix[4,4] <- 1 #not reached by radius 2 neighborhood, but by shell of size 2 + expect_that(cca.single(complex.matrix, s=1, x=1,y=1,mode=1), equals(corner.only)) + expect_that(cca.single(complex.matrix, s=1, x=1,y=1,mode=2), equals(two.areas)) + expect_that(cca.single(complex.matrix, s=2, x=1,y=1,mode=3), equals(two.areas)) + expect_that(cca.single(complex.matrix, s=2, x=1,y=1,mode=2), equals(complex.matrix)) + }) +test_that("cca single works equivalent for nearest neighbors (mode=1) and radius with s=1 (mode=3) ", { + complex.matrix <- matrix(rep(0,60),nrow=12) + complex.matrix[1,1] <- 1 + corner.only <- complex.matrix + complex.matrix[2,2] <- 1 #not reached by nearest neighbor, but by shell of size 1 + two.areas <- complex.matrix + complex.matrix[4,4] <- 1 #not reached by radius 2 neighborhood, but by shell of size 2 + expect_that(cca.single(complex.matrix, s=1, x=1,y=1,mode=3), equals(corner.only)) + }) + diff --git a/tests/testthat/test_coordinate_list.R b/tests/testthat/test_coordinate_list.R new file mode 100644 index 0000000..649e226 --- /dev/null +++ b/tests/testthat/test_coordinate_list.R @@ -0,0 +1,14 @@ +context("ccaR with coordinate list from RasterLayer") + +test_that("ccaR works with cordinate list for landcover", { + data(landcover) + test <- cca(landcover, cell.class=1,s=0.01) + expect_equivalent(length(test$size), 20) + expect_equivalent(table(test$cluster[,3])[1], 2) + test2 <- cca(landcover, s=2*res(landcover)[1]-0.00001, cell.class=c(2), compare="s") + expect_equivalent(length(test2$size), 60) + expect_equivalent(table(test2$cluster[,3])[1], 41) + test3 <- cca(landcover, s=2*res(landcover)[1]-0.00001, cell.class=c(0,1), compare="") + expect_equivalent(length(test3$size), 60) + expect_equivalent(table(test3$cluster[,3])[1], 41) +}) diff --git a/vignettes/compare.pdf b/vignettes/compare.pdf new file mode 100644 index 0000000..8743a8d Binary files /dev/null and b/vignettes/compare.pdf differ diff --git a/vignettes/paper.rnw b/vignettes/paper.rnw new file mode 100644 index 0000000..42de8df --- /dev/null +++ b/vignettes/paper.rnw @@ -0,0 +1,254 @@ +\documentclass[10pt,a4paper]{article} + +% Umlaute und neue deutsche Rechtschreibung +\usepackage[english]{babel} +% Kodierung (meistens utf8: OS X, Linux; latin1: Windows) +\usepackage[utf8]{inputenc} +% mathematische Symbole und Umgebungen +\usepackage{amsmath,amsfonts,amssymb} +\usepackage{graphicx} +% Sweave.sty angeben, gegebenfalls mit Pfad +\usepackage{Sweave} +\SweaveOpts{prefix.string=pics/fig} +% \VignetteIndexEntry{Using the City Clustering Algorithm} +% \VignetteKeywords{R, raster} +% \VignettePackage{cca} + + +\begin{document} +\SweaveOpts{concordance=TRUE} + +\title{Introduction to the City Clustering Algorithm} +\author{Steffen Kriewald} +\date{\today} +\maketitle +\tableofcontents + +\section{Introduction} + +This vignette describes first steps with the R package of the City Clustering Algorithm (CCA). CCA allows to cluster a specific value in a 2-dimensional data-set. This algorithm was originally used to identify cities based on clustered population- or land-cover-data, but can be applied in multiple cases. It was also used to identify hydrological connected areas based on digital elevation models. + +The clustering algorithm based on the burning algorithm presented in Rozenfeld et al.[1] and is implemented in two versions: a matrix based and a list based. The differences in run time and memory use will be discussed in section \ref{sec:compare}. The list-based algorithm can handle geo-referenced data and offers full integration of raster objects. + +For the problem of cities the method in general can be described as following: The CCA selects an urban cell and burns it in a first step. "burns" means the urban cell will be marked and then belongs to a specific cluster. Then the cca starts an iterative burning of all neighboring cells until all neighboring cells are non-urban cells. Another possibility is to allow small gaps between cells, by introducing a cluster distance. These small gaps are important as cities can be divided by rivers or through data processing. + +\section{First clustering} +\label{sec:first} + +We will start with a small example, using the exampledata. The data frame contains two columns representing the rows and columns, namely x and y. We will cluster the given points with a cluster distance s = 1. Here a cluster distance of s = 1 is equal to the pixel width. +<<>>= +library(osc) +data(exampledata) +str(exampledata) +pop.list <- cca(exampledata[,1:2],s=1) +str(pop.list) +@ + +In this example the cluster distance s=1 is equal to rook’s case and consequently only four neighboring cells will be joined to the cluster and no diagonal neighbors, see figure \ref{fig:exdat1}. The result, pop.list, is a list with two entries. The first contains a data frame with the original coordinates followed by a column representing the cluster by an identification number (id). The second is a a vector giving the size of the cluster. First number is the size of the cluster with cluster id = 1, second the size of cluster with cluster id = 2, and so on. + +<>= + palette(rainbow(12)) + pdf(file="pics/exdat1.pdf", paper="special", width=8, height=4) + par(mfrow=c(1,2)) + plot(exampledata$x,exampledata$y,col="darkblue", pch=15, xlab="", ylab="", cex=1.2) + plot(pop.list$cluster$long,pop.list$cluster$lat,col=pop.list$cluster$cluster_id, pch=15, xlab="", ylab="", cex=1.2) + dev.off() + palette("default") +@ + +\begin{figure} +\centering +\includegraphics{pics/exdat1.pdf} +\caption{Clustering based on a data frame with a cluster distance equal to the pixel width. Only the four directly connected neighboring cells are joined to a cluster and no diagonal neighbors. The left figure shows the original data and the right figure the clustered result, where each color represent a separated cluster.} +\label{fig:exdat1} +\end{figure} + + +If we want to use the matrix based version we have first to convert the data into a matrix. + +<<>>= +#initiate empty matrix +exampledata.pop <- matrix(0, nrow=max(exampledata$x), ncol=max(exampledata$y)) + +#restructure data +for(i in 1:NROW(exampledata)){ + exampledata.pop[exampledata$x[i],exampledata$y[i]] <- exampledata$z[i] +} +@ + +Afterwards we can call the cca again for the cluster distance s = 1. + +<<>>= +example.result <- cca(exampledata.pop, s=1) +str(example.result) +@ + +The result, example.result, is a matrix that defines for each cell to which cluster it belongs (illustrated by figure \ref{fig:exdat2}). In this example we used the default value 3 for mode, but mode = 1 will have the same effect. However, if mode = 2 all eight neighboring cells would be considered, which would lead to one single cluster. + +<>= + pdf(file="pics/exdat2.pdf", paper="special", width=8, height=4) + par(mfrow=c(1,2)) + image(x=1:20, y=1:20,exampledata.pop, xlab="", ylab="") + image(x=1:20, y=1:20,example.result, col=c("white",rep(rainbow(12),2)), xlab="", ylab="") + dev.off() +@ + + +\begin{figure} +\centering +\includegraphics{pics/exdat2.pdf} +\caption{Clustering based on a matrix with a cluster distance equal to the pixel width. The result is exactly the same as the previous result shown in figure \ref{fig:exdat1}.} +\label{fig:exdat2} +\end{figure} + + + +\section{Clustering of geo-referenced data} + +The CCA supports native the raster format, but will do the geo-referenced clustering based on points. To do so the cca converts automatically the raster to a data-frame with two columns, namely long and lat coordinates of the center points from the chosen raster cells. For the use of the orthodromic distance, option unit="m", a WGS84 Latitude/Longitude projection of the data is mandatory. The algorithm compares the distances from every cell to each other. If the distance is smaller than the threshold of the cluster-distance the cells will be considered as one cluster. + +Let's do a short example: First we will create a raster and then cluster the cells with value one for three different cluster distances 150, 240 and 111 km. + +<<>>= +# create a raster and set the projection information +raster <- raster(extent(0,5,0,5),nrow=5,ncol=5) +raster[c(1,2,3,5,6,10,17,18,22,23,24)] <- 1 +proj4string(raster) <- CRS("+proj=longlat") +# get a feeling for the dimensions +summary(distance(raster)[]) +@ + +<>= +# cluster all cells with value 1 +# for various cluster distances of 150, 240 and 111 km +cluster <- cca(raster, cell.class=1, s=1.5e+05, unit="m") +cluster2 <- cca(raster, cell.class=1, s=2.4e+05, unit="m") +cluster3 <- cca(raster, cell.class=1, s=1.11e+05, unit="m") +@ + + +Notice the different results, to be more precise the different number of clusters, for the different cluster distances. For the first run with a cluster distance of 150 km where three clusters identified, see figure \ref{fig:raster} left. The distance is large enough to identify all neighbors as part of the cluster. Through an increase to 240 km the upper left cluster and the upper right become one cluster, see figure \ref{fig:raster} middle. The distance is large enough to bridge a horizontal gap of one empty cell, but not to bridge a diagonal gap. This can be reached by a further increase up to 250 km. +A cluster distance smaller than 110 km will result into 11 clusters, every single cell. A special case can be created by choosing a cluster distance of 111 km, see figure \ref{fig:raster} right. In this case the three upper left cells forming one cluster, where as all other cells are separated. This is because of the cell width, which is lower in high latitudes. +It is also possible to cluster a raster pixelwise, without consideration of the projection. +<>= +pixel <- cca(raster, cell.class=1, s=1) +@ +<<>>= +str(pixel) +@ +For a clusterdistance equal to the resolution of the raster this will lead to the same result as the in section \ref{sec:first} presented ways or the clump-function from the raster package [2]. The second entry of pixel, the cluster size, is then simple the number of cells for each cluster. + + +<>= +pdf("pics/raster.pdf", width=9, height=3) +par(mfrow=c(1,3)) +raster[cellFromXY(raster,cluster$cluster[,1:2])] <- cluster$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 150 km") +raster[cellFromXY(raster,cluster2$cluster[,1:2])] <- cluster2$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 240 km") +raster[cellFromXY(raster,cluster3$cluster[,1:2])] <- cluster3$cluster[,3] +plot(raster,col=rainbow(11),legend=FALSE, main="s = 111 km") +dev.off() +@ + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{pics/raster.pdf} +\caption{Clustering of a geo-referenced raster object for several cluster distances. Each colour identifies a cluster. The number of clusters increases with a decrease of the cluster distance.} +\label{fig:raster} +\end{figure} + +\subsection{City Clustering} +Another more realistic example for a fictional land-cover data with five different classes, see figure \ref{fig:landcover}. We chose the cell class 1, a cluster distance of 2 km and the unit="m". + +<>= +data("landcover") +cities <- cca(landcover, cell.class=1, s=2000, unit="m") +@ + +The result, cities, is again a list. However, you can create a raster-object from the results based on the original raster within two lines. + +<>= +str(cities) +result <- landcover*NA +result[cellFromXY(result, cities$cluster[,1:2])] <- cities$cluster[,3] +@ + +We took the coordinates of the clustered cells to compute the cellnumber and replaced these cells with the corosponding cluster id. The result is shown in figure \ref{fig:landcover}. + +<>= +pdf("pics/landcover.pdf", width=8, height=6) +par(mfrow=c(1,2)) +cols <- c("lightblue", "darkred","yellow","orange","green","darkgreen") +plot(landcover, col=cols) +plot(result, col=rainbow(9)) +dev.off() +@ + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{pics/landcover.pdf} +\caption{Clustering of land cover data. Left side the original data and on the right side the clutered result.} +\label{fig:landcover} +\end{figure} + + +\section{Comparison of the two Versions - matrix vs. list} +\label{sec:compare} + +As we have shown already in section \ref{sec:first} the two implemented versions do exactly the same just in different ways. If your data is already in one of the known types (matrix, data.frame, raster) it is logical to use the corresponding implementation of the algorithm. However, there are huge differences in run time and memory consumption which could give you a hint for the right choice. + +The memory use of the matrix-based version depends only on the size of the matrix / raster. As for the algorithm two matrices are needed, consequently the memory use will be doubled. For the list-based version the memory needed depends on the number of cells which are occupied. Each of these cells will be saved in a data-frame with xy-coordinates and a placeholder for the cluster id. + +To summarize it: The list-version is faster for sparse matrices and large cluster-distances, where as the matrix version is better for dense matrices and small cluster-distances. In general the list version needs less memory. + +<>= +library(osc) +data("population") + +tl <- rep(NA,100) +tm <- rep(NA,100) + +for(i in 1:100){ + print(i) + tm[i] <- system.time(mat <- cca(population, s=i, mode=3))[[1]] +} + +index <- which(population[]>0) +long <- ceiling(index/nrow(population)) +lat <- index - ((long-1)*nrow(population)) +popl <- data.frame(long, lat, cluster_id=rep(0, length(lat))) + +for(i in 1:100){ + print(i) + tl[i] <- system.time(list <- cca(popl,s=i) )[[1]] +} + +saveRDS(tm, "tm.rds") +saveRDS(tm, "tl.rds") + +pdf("compare.pdf", width=5, height=4) +plot(tm, xlab="cluster distance", ylab="time in s", col="darkblue", log="", type="l") +lines(tl, col="darkred") +legend("topleft", legend=c("matrix","list"), col=c("darkblue","darkred"), pch="-") +dev.off() +@ + + +\begin{figure} +\centering +\includegraphics{compare.pdf} +\caption{Comparison of the run time for matrix and list based implementation of the cca.} +\label{fig:compare} +\end{figure} + +\newpage +\begin{thebibliography}{9} +\bibitem{Rozenfeld08} +Rozenfeld, H. D., Rybski, D., Andrade, J. S., Batty, M., Stanley, H. E., \& Makse, H. a. (2008). Laws of population growth. Proceedings of the National Academy of Sciences of the United States of America, 105(48), 18702–7. doi:10.1073/pnas.0807435105 +\bibitem{raster14} +Robert J. Hijmans (2014). raster: raster: Geographic data analysis and modeling. R package version 2.2-31. + http://CRAN.R-project.org/package=raster +\end{thebibliography} + +\end{document} diff --git a/vignettes/pics/exdat1.pdf b/vignettes/pics/exdat1.pdf new file mode 100644 index 0000000..9d56105 Binary files /dev/null and b/vignettes/pics/exdat1.pdf differ diff --git a/vignettes/pics/exdat2.pdf b/vignettes/pics/exdat2.pdf new file mode 100644 index 0000000..5013d87 Binary files /dev/null and b/vignettes/pics/exdat2.pdf differ diff --git a/vignettes/pics/fig-concordance.tex b/vignettes/pics/fig-concordance.tex new file mode 100644 index 0000000..3112821 --- /dev/null +++ b/vignettes/pics/fig-concordance.tex @@ -0,0 +1,5 @@ +\Sconcordance{concordance:paper.tex:paper.rnw:% +1 37 1 1 2 1 0 2 1 9 0 1 1 8 0 1 1 11 0 1 2 2 1 1 10 10 1 1 3 2 0 1 5 6 % +0 1 2 2 1 1 2 1 0 1 1 6 0 1 2 2 1 1 8 16 1 1 3 2 0 2 1 1 2 8 0 1 2 1 4 % +3 0 2 1 3 0 1 2 4 1 1 2 4 0 1 3 12 0 1 2 2 1 1 12 10 1 1 2 1 0 1 1 3 0 % +1 2 2 1 1 2 11 0 2 1 3 0 1 2 2 1 1 9 17 1 1 32 18 1} diff --git a/vignettes/pics/landcover.pdf b/vignettes/pics/landcover.pdf new file mode 100644 index 0000000..bc0656c Binary files /dev/null and b/vignettes/pics/landcover.pdf differ diff --git a/vignettes/pics/raster.pdf b/vignettes/pics/raster.pdf new file mode 100644 index 0000000..e3bca9e Binary files /dev/null and b/vignettes/pics/raster.pdf differ