Import data

# Restore the objects stored in the CancerSys workspace image.
# NOTE(review): `all.data`, which the filtering step reads, is presumably one
# of the restored objects -- confirm.
load(file = "./CancerSys.RData")

Perform Fuzzy Clustering

# Run the helper script that is expected to define FSTS.clustering().
source("./FSTS.Clustering.R")

# NOTE(review): group_by(), filter() and n() are dplyr functions, yet no
# library(dplyr) call is visible in this script -- presumably dplyr is attached
# by the sourced file or by a hidden setup step; confirm.

# Keep only patients with more than one record and more than one non-missing
# NTX value, then convert the grouped result to a plain data frame.
# (Despite the object's name, the condition requires at least two observations.)
at.least.1.obs <- as.data.frame(
  filter(group_by(all.data, Patient), n() > 1 && sum(!is.na(NTX)) > 1)
)

# One row per patient, one NTX.<time> column per measurement time.
data.in <- reshape(
  data = at.least.1.obs[, c("Patient", "time", "NTX")],
  direction = "wide",
  idvar = "Patient",
  timevar = "time",
  v.names = "NTX"
)

# Cluster the NTX trajectories into 6 groups; the value columns are the
# NTX.<time> columns produced by the reshape above.
out <- FSTS.clustering(
  data = data.in,
  id.var = "Patient",
  time.vector = c(0, 1, 3, 6, 9, 12),
  value.vars = paste0("NTX.", c(0, 1, 3, 6, 9, 12)),
  num.clust = 6,
  part.coef = 2,
  stop.crit = 1e-5,
  max.iter = 100
)

# Start from the imputed wide table returned by the clustering routine and
# label each patient with the index of its largest membership value.
# NOTE(review): assumes out$U holds one column per row of out$data.imputed and
# one row per cluster -- confirm against FSTS.clustering().
data.imputed <- out$data.imputed
data.imputed$cluster <- apply(out$U, 2, FUN = which.max)

# Attach the patient-level covariates, taken from the first row of each
# Patient. Rows are paired through the Patient id rather than by position, so
# the result stays correct even if data.imputed and at.least.1.obs list the
# patients in a different order.
covariate.cols <- c("Sex", "Primary", "Age.Diagnosis",
                    "NSRE", "Extra.Mets", "Death.Status",
                    "Survival.after.begining.ZA.mo")
first.rows <- at.least.1.obs[!duplicated(at.least.1.obs$Patient), ]
data.imputed[, covariate.cols] <-
  first.rows[match(data.imputed$Patient, first.rows$Patient), covariate.cols]

# Back to long format: one row per patient and measurement time.
data.imputed.long <- reshape(data = data.imputed, idvar = "Patient",
                             varying = c("NTX.0", "NTX.1", "NTX.3", "NTX.6",
                                         "NTX.9", "NTX.12"),
                             times = c(0, 1, 3, 6, 9, 12), direction = "long")

data.imputed.long <- data.imputed.long[order(data.imputed.long$Patient,
                                             data.imputed.long$time), ]

# Keep only the time points that do not exceed the patient's
# Survival.after.begining.ZA.mo value.
data.imputed.long <- data.imputed.long %>%
  filter(time <= Survival.after.begining.ZA.mo)

# Flag the values that were not actually observed. Each long row is looked up
# in the original data by (Patient, time) instead of relying on both tables
# having identical row order and row count; a combination with no original
# record is flagged as missing as well.
obs.row <- match(
  paste(data.imputed.long$Patient, data.imputed.long$time, sep = "@"),
  paste(at.least.1.obs$Patient, at.least.1.obs$time, sep = "@")
)
data.imputed.long$is.missing <- is.na(at.least.1.obs$NTX[obs.row])

Plot the clusters

library(ggplot2)

# Per-cluster spread and level of the NTX trajectories: sd and mean are first
# computed within each patient and then averaged over the patients of a
# cluster. na.rm = TRUE in the second step keeps a patient whose sd is
# undefined (a single remaining time point) from turning the whole cluster's
# value into NA.
sd.by.cluster <- data.imputed.long %>% group_by(Patient, cluster) %>%
  summarize(sd = sd(NTX, na.rm = TRUE), mean = mean(NTX, na.rm = TRUE)) %>%
  group_by(cluster) %>%
  summarise(sd = mean(sd, na.rm = TRUE), mean = mean(mean, na.rm = TRUE))

# Pair the statistics with the centroid rows by cluster id, not by position,
# so that a cluster without any patient yields NA instead of shifting the
# statistics of the following clusters.
# NOTE(review): assumes row i of out$clusters is the centroid of cluster i,
# matching the row index used for the cluster assignment -- confirm.
cluster.ids <- seq_len(nrow(out$clusters))
stat.row <- match(cluster.ids, sd.by.cluster$cluster)
cluster.sd <- sd.by.cluster$sd[stat.row]
cluster.mean <- sd.by.cluster$mean[stat.row]

# Scale every centroid (all but the first two columns of out$clusters) by its
# cluster sd and shift it by its cluster mean -- presumably to bring
# standardized centroids back to the NTX scale; confirm.
clust <- sweep(out$clusters[, -(1:2)], MARGIN = 1, STATS = cluster.sd,
               FUN = "*") + cluster.mean
clust <- as.data.frame(clust)
clust$cluster <- cluster.ids
clust$is.missing <- FALSE
# "clust" is not an existing column, so reshape() creates it as a row id; it
# is dropped again right after.
clust <- reshape(data = clust, idvar = "clust",
                 varying = c("V1", "V2", "V3", "V4", "V5", "V6"),
                 direction = "long", v.names = "NTX",
                 times = c(0, 1, 3, 6, 9, 12))
clust$clust <- NULL
clust$type <- "Centroid"

# Patient trajectories and centroids in one table; centroid rows carry no
# Patient value.
plot.data <- data.imputed.long
plot.data$type <- "Trajectories"
plot.data <- bind_rows(plot.data, clust)

# Human-readable facet labels, built in one vectorized step with a single
# space ("Cluster 1"); the earlier per-cluster loop produced "Cluster  1".
plot.data$cluster <- paste("Cluster", plot.data$cluster)

# Number of distinct patients per cluster, printed inside each facet.
clustering.summary <- plot.data %>% filter(type == "Trajectories") %>%
  distinct(Patient, cluster) %>%
  group_by(cluster) %>%
  summarize(Value = n())

# Imputed (originally missing) values are blanked out so that only observed
# values are drawn. Scale values are named so the mapping does not depend on
# the alphabetical order of the `type` levels.
ggplot(data = plot.data %>% mutate(NTX = ifelse(is.missing, NA, NTX)),
       aes(x = time, y = NTX, group = Patient)) +
  geom_point(aes(color = type, alpha = type)) +
  geom_line(aes(color = type, alpha = type)) +
  geom_text(data = clustering.summary, aes(label = Value), inherit.aes = FALSE,
            x = 12, y = 1650) +
  scale_alpha_manual(values = c(Centroid = 1, Trajectories = 0.5)) +
  scale_colour_manual(values = c(Centroid = "black",
                                 Trajectories = "#00B6EB")) +
  scale_x_continuous(breaks = c(0, 1, 3, 6, 9, 12)) +
  xlab("Time [months]") +
  ylab("NTX [nmol BCE/mmol creatinine]") +
  facet_wrap(~cluster) +
  labs(color = "Legend", alpha = "Legend")
## Warning: Removed 178 rows containing missing values (geom_point).
## Warning: Removed 133 rows containing missing values (geom_path).

Save results

# Store the long-format imputed table under the name data.with.clusts and
# write that single object to disk.
data.with.clusts <- data.imputed.long

save(list = "data.with.clusts", file = "./6Clusters.RData")