Folosesc dtw pentru a calcula distanțele dintre mai multe serii și obțin rezultate ciudate. Observați că în eșantionul de date de mai jos primii 9 clienți sunt seturi identice (A==B==C, D==E==F și G==H==I). Rândurile rămase sunt doar pentru zgomot pentru a-mi permite să fac 8 clustere.
Mă aștept ca primele seturi să fie grupate cu partenerii lor identici. Acest lucru se întâmplă când calculez distanța pe datele originale, dar când scalez datele înainte de distanță/clustering, obțin rezultate diferite.
Distanțele dintre rândurile identice din datele originale sunt 0,0 (cum era de așteptat), dar cu datele scalate distanțele nu sunt 0,0 (nici măcar aproape). Aveți vreo idee de ce nu sunt la fel?
library(TSdist)
library(dplyr)
library(tidyr)
mydata = as_data_frame(read.table(textConnection("
cust P1 P2 P3 P4 P5 P6 P7 P8 P9 P10
1 A 1.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
2 B 1.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
3 C 1.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
4 D 0.0 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0
5 E 0.0 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0
6 F 0.0 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0
7 G 2.0 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5
8 H 2.0 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5
9 I 2.0 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5
10 D2 1.0 2.0 1.0 0.0 1.0 2.0 1.0 0.0 1.0 2.0
11 E2 5.0 6.0 5.0 4.0 5.0 6.0 5.0 4.0 5.0 6.0
12 F2 9.0 10.0 9.0 8.0 9.0 10.0 9.0 8.0 9.0 10.0
13 G2 1.5 1.0 0.5 0.0 0.5 1.0 1.5 2.0 1.5 1.0
14 H2 5.5 5.0 4.5 4.0 4.5 5.0 5.5 6.0 5.5 5.0
15 I2 9.5 9.0 8.5 8.0 8.5 9.0 9.5 10.0 9.5 9.0
16 A3 1.0 1.0 0.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0
17 B3 5.0 5.0 5.0 5.0 5.0 3.0 8.0 5.0 5.0 5.0
18 C3 9.0 9.0 9.0 9.0 9.0 5.4 14.4 9.0 9.0 9.0
19 D3 0.0 1.0 2.0 1.0 0.0 1.0 1.0 2.0 0.0 1.0
20 E3 4.0 5.0 5.0 6.0 4.0 5.0 6.0 5.0 4.0 5.0
21 F3 8.0 9.0 10.0 9.0 9.0 9.0 9.0 9.0 8.0 9.0
22 G3 2.0 1.5 1.0 0.5 0.0 0.5 1.0 2.0 1.5 1.5
23 H3 6.0 5.5 5.0 4.5 4.0 5.0 4.5 5.5 6.0 5.5
24 I3 10.0 9.5 9.0 9.0 8.0 8.5 9.0 9.5 10.0 9.5
25 D4 0.0 3.0 6.0 3.0 0.0 3.0 6.0 3.0 0.0 5.0
26 E4 3.0 6.0 9.0 6.0 3.0 6.0 9.0 6.0 3.0 6.0
27 F4 4.0 6.0 10.0 7.0 5.0 6.0 11.0 8.0 5.0 7.0
28 D5 5.0 0.0 3.0 6.0 3.0 0.0 3.0 6.0 3.0 0.0
29 D6 9.0 6.0 3.0 6.0 9.0 6.0 3.0 6.0 9.0 6.0
30 D7 9.0 11.0 5.0 4.0 6.0 10.0 7.0 5.0 6.0 11.0
31 Dw 0.0 0.8 1.4 2.0 1.0 0.0 2.0 0.0 1.0 2.0
32 Ew 4.0 4.8 5.4 6.0 5.0 4.0 6.0 4.0 5.0 6.0
33 Fw 8.0 8.8 9.4 10.0 9.0 8.0 10.0 8.0 9.0 10.0
34 Gw 2.0 1.5 1.0 0.5 0.0 1.0 2.0 1.5 1.3 1.1
35 Hw 6.0 5.5 5.0 4.5 4.0 5.0 6.0 5.5 5.3 5.1
36 Iw 10.0 9.5 9.0 8.5 8.0 9.0 10.0 9.5 9.3 9.1"),
header = TRUE, stringsAsFactors = FALSE))
k=8
# create a scale version of mydata (raw data - mean) / std dev
mydata_long = mydata %>%
mutate (mean = apply(mydata[,2:ncol(mydata)],1,mean,na.rm = T)) %>%
mutate (sd = apply(mydata[,2:(ncol(mydata))],1,sd,na.rm = T))%>%
gather (period,value,-cust,-mean,-sd) %>%
mutate (sc = (value-mean)/sd)
mydata_sc = mydata_long[,-c(2,3,5)] %>%
spread(period,sc)
# dtw
dtw_dist = TSDatabaseDistances(mydata[2:ncol(mydata)], distance = "dtw",lag.max= 2) #distance
dtw_clus = hclust(dtw_dist, method="ward.D2") # Cluster
dtw_res = data.frame(cutree(dtw_clus, k)) # cut dendrogram into 9 clusters
# dtw (w scaled data)
dtw_sc_dist = TSDatabaseDistances(mydata_sc[2:ncol(mydata_sc)], distance = "dtw",lag.max= 2) #distance
dtw_sc_clus = hclust(dtw_sc_dist, method="ward.D2") # Cluster
dtw_sc_res = data.frame(cutree(dtw_sc_clus, k)) # cut dendrogram into 9 clusters
results = cbind (dtw_res,dtw_sc_res)
names(results) = c("dtw", "dtw_scaled")
print(results)
dtw dtw_scaled
1 1 1
2 1 2
3 1 1
4 1 2
5 1 1
6 1 2
7 1 3
8 1 4
9 1 3
10 1 3
11 2 3
12 3 4
13 1 5
14 2 6
15 3 3
16 1 4
17 2 3
18 4 3
19 1 6
20 2 3
21 3 4
22 1 3
23 2 3
24 3 6
25 5 7
26 6 8
27 7 7
28 5 7
29 6 7
30 8 8
31 1 7
32 2 7
33 3 7
34 1 8
35 2 7
36 3 7