Skip to content

Commit

Permalink
weighted cluster using DBSCAN, ref #27
Browse files Browse the repository at this point in the history
  • Loading branch information
Hussein-Mahfouz committed Feb 6, 2024
1 parent 554bf7a commit 57d8470
Showing 1 changed file with 75 additions and 108 deletions.
183 changes: 75 additions & 108 deletions code/demand_cluster_flows.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,59 +69,30 @@ od_demand_filtered = filter_matrix_by_distance(zones = study_area,

# ----- STEP 1: Get distance between flows


# --- a. prepare the data

# metric crs
od_demand_filtered <- od_demand_filtered %>%
st_transform(3857)

# Add columns for coordinates of startpoint (X, Y) and endpoint (U, V)

x <- od_demand_filtered %>%
od_demand_xyuv <- od_demand_filtered %>%
mutate(x = as_tibble(st_coordinates(st_startpoint(od_demand_filtered)))$X,
y = as_tibble(st_coordinates(st_startpoint(od_demand_filtered)))$Y,
u = as_tibble(st_coordinates(st_endpoint(od_demand_filtered)))$X,
v = as_tibble(st_coordinates(st_endpoint(od_demand_filtered)))$Y) %>%
st_drop_geometry()

# create flow ID column
x <- x %>%
od_demand_xyuv <- od_demand_xyuv %>%
mutate(flow_ID = paste(Origin, Destination, sep = "-"))


# filter to specific origin
x_origin_i <- x %>% filter(Origin == study_area$MSOA21CD[1])

# create grid
x_origin_i_grid <- expand_grid(flow_ID_a = x_origin_i$flow_ID,
flow_ID_b = x_origin_i$flow_ID)

# add the coordinate data
x_origin_i_grid2 <- x_origin_i_grid %>%
# --- add flow_a coordinates
inner_join(x %>%
select(flow_ID, x, y, u, v, distance_m),
by = c("flow_ID_a" = "flow_ID")) %>%
# rename coordinate columns by adding a suffix
rename_with(.fn = ~ paste0(.x, "_i"), .cols = c("x", "y", "u", "v", "distance_m")) %>%
# --- add flow_b coordinates
inner_join(x %>%
select(flow_ID, x, y, u, v, distance_m),
by = c("flow_ID_b" = "flow_ID")) %>%
# rename coordinate columns by adding a suffix
rename_with(.fn = ~ paste0(.x, "_j"), .cols = c("x", "y", "u", "v", "distance_m"))

# ----- get "Flow Distance" and "Flow Dissimilarity"

# Flow Distance
# --- b. Calculate distance matrix

x_origin_i_grid2 <- x_origin_i_grid2 %>%
mutate(fd = sqrt(0.5 * ((x_i - x_j)^2 + (y_i - y_j)^2) +
0.5 * ((u_i - u_j)^2 + (v_i - v_j)^2)),
fds = sqrt((0.5 * ((x_i - x_j)^2 + (y_i - y_j)^2) +
0.5 * ((u_i - u_j)^2 + (v_i - v_j)^2)) / (distance_m_i * distance_m_j)))



flow_distance = function(study_area, flows, geoid){
# Function to get flow distance and flow dissimilarity by iterating over Origins
flow_distance = function(study_area, flows, alpha = 1, beta = 1, geoid){
# empty list to store results for each origin
results <- vector(mode = "list", length = nrow(study_area))

Expand All @@ -135,7 +106,7 @@ flow_distance = function(study_area, flows, geoid){

# create grid
flows_origin_i_grid <- expand_grid(flow_ID_a = flows_origin_i$flow_ID,
flow_ID_b = flows_origin_i$flow_ID)
flow_ID_b = flows_origin_i$flow_ID)

# add the coordinate data
flows_origin_i_grid <- flows_origin_i_grid %>%
Expand All @@ -154,12 +125,13 @@ flow_distance = function(study_area, flows, geoid){

# ----- get "Flow Distance" and "Flow Dissimilarity"

# Flow Distance
flows_origin_i_grid <- flows_origin_i_grid %>%
mutate(fd = sqrt(0.5 * ((x_i - x_j)^2 + (y_i - y_j)^2) +
0.5 * ((u_i - u_j)^2 + (v_i - v_j)^2)),
fds = sqrt((0.5 * ((x_i - x_j)^2 + (y_i - y_j)^2) +
0.5 * ((u_i - u_j)^2 + (v_i - v_j)^2)) / (distance_m_i * distance_m_j)))
# Flow Distance
mutate(fd = sqrt(alpha * ((x_i - x_j)^2 + (y_i - y_j)^2) +
beta * ((u_i - u_j)^2 + (v_i - v_j)^2)),
# Flow Dissimilarity
fds = sqrt((alpha * ((x_i - x_j)^2 + (y_i - y_j)^2) +
beta * ((u_i - u_j)^2 + (v_i - v_j)^2)) / (distance_m_i * distance_m_j)))

# add results to list
results[[i]] <- flows_origin_i_grid
Expand All @@ -168,64 +140,62 @@ flow_distance = function(study_area, flows, geoid){
return(results)
}


test <- flow_distance(study_area = study_area,
flows = x,
# apply function to get distances
distances <- flow_distance(study_area = study_area,
flows = od_demand_xyuv,
alpha = 1,
beta = 1,
geoid = geoid_col)

test2 <- bind_rows(test)
# convert from list of dfs to one df
distances <- bind_rows(distances)

# replace na values with


# convert from long to wide
test2 %>%
# convert df to a distance matrix
dist_mat <- distances %>%
select(flow_ID_a, flow_ID_b, fds) %>%
pivot_wider(names_from = flow_ID_b, values_from = fds) %>%
column_to_rownames(var = "flow_ID_a") -> test3
column_to_rownames(var = "flow_ID_a")



# this is a sparse matrix (lot's of OD pairs without flow, so most pairs of OD pairs don't exist)
# replace NA with a very high number?
max(test3, na.rm = TRUE)

test3[is.na(test3)] <- max(test3, na.rm = TRUE) * 2

max(dist_mat, na.rm = TRUE)

# hdbscan
dist_mat[is.na(dist_mat)] <- max(dist_mat, na.rm = TRUE) * 2

# sample
test4 <- test3[1:5000, 1:5000]
test4 <- test3

# ------------------------- CLUSTERING USING DBSCAN ------------------------- #

# CLUSTERING
# ---------- STEP 1: Decide on Epsilon (ε) and minPTS

# ---------- HDBSCAN
# epsilon (ε) is a parameter used to define the maximum distance threshold
# for points to be considered as neighbors. minPTS is the minimum number of points
# needed in a radius (ε) for a cluster to be formed

res = dbscan::hdbscan(test4, minPts = 50)
# ----- K-nearest neighbor knee plot
kNNdistplot(dist_mat, k = 100)

# ----- Explore distance distribution of matrix

# ---------- OTHER (DBSAN / OPTICS)
# distance distribution of the flows
hist(distances$fds)

# in dbscan / optics, we need to define eps. epsilon (ε) is a parameter used to
# define the maximum distance threshold for points to be considered as neighbors

# we need to look at the distance distribution of the flows
hist(test2$fds)
# ----- Monte Carlo simulation to get mean value of distances

# if we consider all flows, the distance is skewed by flows that START at the same O
# or end at the same D, and this will give us clusters of flows that either start or end at the same point

n <- nrow(test2) # Number of rows in your dataframe
n <- nrow(distances) # Number of rows in your dataframe
num_samples <- 100 # Number of random samples
repetitions <- 1000 # Number of times to repeat the process

# Define a function to calculate the sum of "distance" column for random samples
mean_distance <- function() {
sample_indices <- sample(1:n, num_samples, replace = FALSE)
subset_df <- test2[sample_indices, ]
subset_df <- distances[sample_indices, ]
return(mean(subset_df$fds))
}

Expand All @@ -234,41 +204,45 @@ results <- replicate(repetitions, mean_distance())
hist(results)


# ---------- DBSCAN (with wights)
# ---------- STEP 2: Create Weights

# We have an OD matrix where each flow vector represents n commuters. We need to weight each vector
# by the number of commuters so that minPTS works correctly and doesn't treat each vector as 1 item.

# get weights
test4 %>%
# The weights vector needs to match the rows of the distance matrix

# --- get weights
w <- dist_mat %>%
rownames_to_column(var = "flow_ID") %>%
select(flow_ID) %>%
# add commuting data to
inner_join(od_demand_filtered %>%
st_drop_geometry() %>%
mutate(flow_ID = paste0(Origin, "-", Destination)) %>%
select(flow_ID, commute_all),
by = "flow_ID") -> w

by = "flow_ID")

# weight vector
w_vec <- as.vector(w$commute_all)

res = dbscan::dbscan(test4, minPts = 400, eps = 6, weights = w_vec)



# ---------- OPTICS
res = dbscan::optics(test4, eps = 1, minPts = 20)
plot(res)
# get eps_cl from plot y axis (threshold that seperates the clusters)
res = extractDBSCAN(res, eps_cl = 1.7)
# plot again to see clustering results
plot(res)
# cluster option 1: border points assigned to cluster
cluster_dbscan = dbscan::dbscan(dist_mat,
minPts = 250,
eps = 1.2,
weights = w_vec)

# attempt to precomute weighted distance matrix

# # Apply weights to the distance matrix
# weighted_distance_matrix <- test4 * weights %*% t(weights)
# # cluster option 2: border points not assigned to cluster
# cluster_dbscan = dbscan::dbscan(dist_mat,
# minPts = 200,
# eps = 1.3,
# weights = w_vec,
# borderPoints = FALSE)


#cluster_dbscan = dbscan::dbscan(dist_mat, minPts = 500, eps = (max(distances$fds) * 5/8), weights = w_vec)


unique(cluster_dbscan$cluster)



Expand All @@ -284,23 +258,23 @@ plot(res)


# get results
test5 <- test4 %>%
mutate(cluster = res$cluster)
cluster_dbscan_res <- dist_mat %>%
mutate(cluster = cluster_dbscan$cluster)

# prepare data for joining
test5 %>%
cluster_dbscan_res <- cluster_dbscan_res %>%
rownames_to_column(var = "flow_ID") %>%
select(flow_ID, cluster) -> test5
select(flow_ID, cluster)


# add geometry back

od_demand_filtered %>%
cluster_dbscan_res <- od_demand_filtered %>%
mutate(flow_ID = paste0(Origin, "-", Destination)) %>%
inner_join(test5, by = "flow_ID") -> test6
inner_join(cluster_dbscan_res, by = "flow_ID")

# plot
plot(test6["cluster"])
plot(cluster_dbscan_res["cluster"])



Expand All @@ -311,8 +285,9 @@ tm_shape(study_area) +
tm_shape(study_area) +
tm_fill(col = "grey95",
alpha = 0.5) +
tm_shape(test6 %>%
filter(distance_m > 20000, cluster < 20) %>%
tm_shape(cluster_dbscan_res %>%
#filter(cluster < 20) %>%
#filter(distance_m > 20000) %>%
mutate(cluster = as.factor(cluster))) +
tm_lines(lwd = "commute_all",
col = "cluster",
Expand All @@ -325,7 +300,7 @@ tm_shape(test6 %>%
legend.col.is.portrait = FALSE) +
tm_facets(by = "cluster",
free.coords = FALSE,
nrow = 4,
nrow = 3,
showNA = FALSE) +
tm_layout(fontfamily = 'Georgia',
main.title = "Clustering flows using HDBSCAN",
Expand All @@ -338,14 +313,6 @@ tm_shape(test6 %>%
frame = FALSE)


test6 %>% group_by(cluster) %>%
st_drop_geometry() %>%
summarise(commute_all = sum(commute_all)) -> p





# Option 2: Flow Dissimilarity


Expand Down

0 comments on commit 57d8470

Please sign in to comment.