## Generalized single linkage clustering: an illustration
## ======================================================
## Werner Stuetzle (wxs@u.washington.edu), 3-19-2009

## This document assumes that you have read the paper
## W. Stuetzle and R. Nugent. A generalized single linkage method for
## estimating the cluster tree of a density. Journal of Computational
## and Graphical Statistics, 2009 (to appear).

## We will use Fisher's iris data. They are getting old and tired, but
## they are available in R, everybody knows them, and for purposes of
## illustration they are fine.

## You must install the "gslclust" package before you can run the
## illustration code below. The package is part of the supplementary
## materials for the paper available on the JCGS Web site.

library(gslclust)

data(iris)
attach(iris)
X <- cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)

## First we must choose a density estimate. The package provides two
## options: the nearest neighbor estimate, and a kernel estimate with
## spherical Gaussian kernel.

##-----------------------------------------------------------------
## Let's first try the nearest neighbor estimate. We use the function
## gsl.make.nn.mdsfun to generate a "minimum density similarity
## function" nn.mdsfun(i, j), which returns the minimum of the nearest
## neighbor density estimate over the edge (line segment) connecting
## observations i and j. (Actually, that's not really true, but don't
## worry about it - the graph cluster tree comes out right.)

nn.mdsfun <- gsl.make.nn.mdsfun(X)

## Next we use gsl.cluster to compute the graph cluster tree, the runt
## sizes and excess masses, and assorted other stuff. The output of
## gsl.cluster is not intended for your consumption.

gsl.cluster.out <- gsl.cluster(X, nn.mdsfun)

## Let's have a look at the runt sizes

gsl.runt.size(gsl.cluster.out)[1:10]

## Here's what you should see
## [1] 50 38 16 14 14  7  5  4  4  4

## We scan the runt sizes from small to large and look for the first
## gap (see Section 5 and Section 7 of the paper). It seems to be
## between 7 and 14. So let's choose 14 as the runt size
## threshold. This means the pruned graph cluster tree will have 5
## internal nodes and 6 leaves. Let's compute the pruned tree.

gsl.cluster.out <- gsl.cluster(X, nn.mdsfun, pruning.criterion = "size",
                               pruning.threshold = 14)

## Draw the pruned graph cluster tree. The numbers below the interior
## nodes are the runt sizes. The numbers above the leaves are the leaf
## codes.

bla <- gsl.draw.cluster.tree(gsl.cluster.out, draw.runt.size = T,
                             draw.cluster.leaf.codes = T)

## Let's do some diagnostics to assess cluster separation. Before
## proceeding, have a look at the help files for the functions
## gsl.gkl.diagnostic.plot and gsl.subtree.gkl.diagnostic.plot.

gsl.gkl.diagnostic.plot(X, gsl.cluster.out, draw.runt.size = T,
                        draw.cluster.leaf.codes = T)

## Click on the root node. You will see two clearly bimodal
## histograms. So the daughters of the root node are clearly
## separated in feature space.

## Do the same thing again, but click on the left daughter of the root
## node (the node with runt size 16).

gsl.gkl.diagnostic.plot(X, gsl.cluster.out, draw.runt.size = T,
                        draw.cluster.leaf.codes = T)

## Well, it does look bimodal, but the dip is shallow.
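## Before drilling further into individual nodes, here is a quick
## non-interactive check. This is our own base R sketch, not one of
## the package diagnostics; it only assumes that
## gsl.observation.labels (used again below) returns one leaf label
## per observation. We color a scatterplot matrix of the features by
## the labels of the pruned tree.

leaf.labels <- gsl.observation.labels(gsl.cluster.out)
pairs(X, col = as.integer(factor(leaf.labels)), pch = 19)

## This is only an informal look; the GKL diagnostic plots remain the
## principled way to assess separation.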
## Let's get all the observations in the left daughter of the root
## node and see how many there are.

obs.in.left.daughter.of.root <-
  gsl.visually.select.observations.in.node(gsl.cluster.out)$in.node

## Click on the left daughter of the root node

sum(obs.in.left.daughter.of.root)

## [1] 50

## Only 50 observations. So the default of 20 breaks in the histograms
## may be too many. Let's try 10.

gsl.gkl.diagnostic.plot(X, gsl.cluster.out, draw.runt.size = T,
                        draw.cluster.leaf.codes = T, n.breaks = 10)

## Not much bimodality left. Now check out the right daughter of the
## root node.

gsl.gkl.diagnostic.plot(X, gsl.cluster.out, draw.runt.size = T,
                        draw.cluster.leaf.codes = T)

## Click on the right daughter of the root node. Pretty clear
## bimodality. I know, I know - it helps to know what one is looking
## for. Now check out the right grand-daughters of the root node - you
## get the idea.

## We can also assess the separation between the leaves of a subtree.

gsl.subtree.gkl.diagnostic.plot(X, gsl.cluster.out)

## Click on the right daughter of the root node. Focus on the black
## dots in the scatterplot, which are the observations in the leaf
## cores. There seem to be two clusters, not four. Then hit "return",
## and the observations in the leaves will be shown in different
## colors.

## Maybe a better runt size threshold would be 38, which would give a
## pruned cluster tree with three leaves.

gsl.cluster.out <- gsl.cluster(X, nn.mdsfun, pruning.criterion = "size",
                               pruning.threshold = 38)

## Let's get the cluster labels and see how they align with the
## species.

observation.labels <- gsl.observation.labels(gsl.cluster.out)
table(observation.labels, Species)

##                   Species
## observation.labels setosa versicolor virginica
##                  2     50          0         0
##                  6      0         45         1
##                  7      0          5        49

## Well how about it? The labels reflect the species almost perfectly.

## Let's check whether the agreement gets better if we only look at
## observations in the leaf cores.

in.core <- gsl.observation.in.core(gsl.cluster.out)
table(observation.labels[in.core], Species[in.core])

##     setosa versicolor virginica
##   2     50          0         0
##   6      0         39         0
##   7      0          3        35

## There are still some virginica that end up in the versicolor
## cluster.

## Let's look how well the clusters obtained with runt size threshold
## 14 match up with the species.

gsl.cluster.out <- gsl.cluster(X, nn.mdsfun, pruning.criterion = "size",
                               pruning.threshold = 14)
observation.labels <- gsl.observation.labels(gsl.cluster.out)
table(observation.labels, Species)

##                   Species
## observation.labels setosa versicolor virginica
##                  4      30          0         0
##                  5      20          0         0
##                  12      0         30         1
##                  13      0         15         0
##                  14      0          0        36
##                  15      0          5        13

## Each of the species is broken up into two clusters.

##-----------------------------------------------------------------
## Now try a kernel density estimate. We use a standard Gaussian
## kernel on the sphered data, which is the same as using a Gaussian
## kernel with the same covariance as the data.

XX <- sphere(X)

## Use least squares cross-validation to determine the bandwidth.

cv.search.out <- cv.search(XX)
h <- cv.search.out$opt.smopar
h

## [1] 0.3510157

## Now make a minimum density similarity function based on the kernel
## density estimate.

kernel.mdsfun <- gsl.make.kernel.mdsfun(XX, h, interactive = T)

## Setting "interactive = T" makes the function print out a "progress
## report" for the impatient.
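## As a rough sanity check on the bandwidth, we can evaluate a kernel
## density estimate at the observations and look at its range. This is
## again our own base R sketch, not part of gslclust, and it assumes
## the package uses an ordinary spherical Gaussian kernel estimate on
## the sphered data.

kde.at.obs <- function(Z, h) {
  ## f(x_i) = (1/n) sum_j (2 pi h^2)^(-d/2) exp(-||x_i - x_j||^2 / (2 h^2))
  d <- ncol(Z)
  D2 <- as.matrix(dist(Z))^2
  rowMeans(exp(-D2 / (2 * h^2))) / (2 * pi * h^2)^(d / 2)
}
summary(kde.at.obs(XX, h))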
gsl.cluster.out <- gsl.cluster(XX, kernel.mdsfun)

gsl.runt.size(gsl.cluster.out)[1:10]

## [1] 46  7  6  5  5  3  3  3  2  2

round(nrow(XX) * gsl.runt.excess.mass(gsl.cluster.out), 0)[1:10]

## [1] 27  2  2  2  2  2  1  1  1  1

## Runt excess mass is easier to scan if we convert it from the
## probability scale to the "equivalent number of observations" scale.
## Both runt size and runt excess mass suggest two clusters.

gsl.cluster.out <- gsl.cluster(XX, kernel.mdsfun, pruning.criterion = "size",
                               pruning.threshold = 46)

## Let's get the cluster labels and see how they align with the
## species.

observation.labels <- gsl.observation.labels(gsl.cluster.out)
table(observation.labels, Species)

##                   Species
## observation.labels setosa versicolor virginica
##                  2     50          0         0
##                  3      0         50        50

## It seems like we are oversmoothing. We have observed more generally
## that nearest neighbor estimation tends to do not much worse than
## kernel density estimation, and often does better.
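## As a simple numeric summary of the label / species tables above
## (our own helper, not part of gslclust): the proportion of
## observations whose cluster's majority species matches their own
## species.

majority.agreement <- function(labels, species) {
  tab <- table(labels, species)
  sum(apply(tab, 1, max)) / length(labels)
}
majority.agreement(observation.labels, Species)

## From the tables shown above this is 100/150 for the two-cluster
## kernel solution, compared to 144/150 for the three-leaf nearest
## neighbor solution with runt size threshold 38.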