'''
-------------------------------------------------------------------------
References:
[1] M. Ester, H. Kriegel, J. Sander, X. Xu, A density-based algorithm for
discovering clusters in large spatial databases with noise, proc.
2nd Int. Conf. on Knowledge Discovery and Data Mining, Portland, OR, 1996,
p. 226, available from:
www.dbs.informatik.uni-muenchen.de/cgi-bin/papers?query=--CO
[2] M. Daszykowski, B. Walczak, D. L. Massart, Looking for
Natural Patterns in Data. Part 1: Density Based Approach,
Chemom. Intell. Lab. Syst. 56 (2001) 83-92
-------------------------------------------------------------------------
Written by Michal Daszykowski
Department of Chemometrics, Institute of Chemistry,
The University of Silesia
December 2004
http://www.chemometria.us.edu.pl

ported to python Dec, 2008 by Brian H. Clowers, Pacific Northwest National Laboratory.
Dependencies include scipy, numpy, and hcluster.

further adapted by Robert McGibbon, Jan 2011.
'''

import numpy as np
import scipy.spatial.distance
import scipy.special as special


def set2List(indArray):
    ind = []
    for item in indArray:
        ind.append(item.tolist())

    return ind

def dbscan(distance_matrix, k, eps):
    '''Run density-based clustering (DBSCAN).
    
    distance_matrix should be the condensed or full distance matrix of your data.
    It can be computed with you favorite distance metric.
    
    k is the minimum number of objects that are considered a cluster
    
    eps is the so called neighborhood radius
    
    
    Returns: cClass, a vector of the assignments of each data point, and tType,
    a vector specifying type of the i-th object (core: 1, border: 0, outlier: -1)
    '''
    
    dist = scipy.spatial.distance.squareform(distance_matrix, force='to_matrix')
    m = dist.shape[0]
    
    x = np.arange(0,m)
    type = np.zeros(m)
    touched = np.zeros(m)
    no = 1
    
    tType = np.zeros(m)
    cClass = np.zeros(m)
    
    for i in xrange(m): # for each object
        if touched[i] == 0:
            ob = x[i]
            D = dist[ob]
            ind = np.where(D<=eps)
            ind = set2List(ind)[0]
            
            if len(ind)>1 and len(ind)<(k+1):
                tType[i] = 0
                cClass[i] = 0
            
            if len(ind) == 1:
                tType[i] = -1
                cClass[i] = -1
                touched[i] = 1
            
            if len(ind) >= k+1:
                tType[i] = 1
                cClass[ind] = np.ones(len(ind))*no
                
                for l in ind:
                    ob2 = x[l]
                    touched[l]=1
                    D2 = dist[ob2]
                    i1 = np.where(D2<=eps)
                    i1 = set2List(i1)[0]
                    if len(i1) > 1:
                        cClass[i1] = no
                        if len(i1)>=k+1:
                            tType[ob2] = 1
                        else:
                            tType[ob2] = 0
                        
                        for j in xrange(len(i1)):
                            if touched[i1[j]] == 0:
                                touched[i1[j]]=1
                                ind.append(i1[j])
                                cClass[i1[j]] = no
                
                no+=1
    i1 = np.where(cClass == 0)
    i1 = set2List(i1)[0]
    cClass[i1] = -1
    tType[i1] = -1
    return cClass, tType


def epsilon(x,k):
    '''Analytic way of estimating neighborhood radius for DBSCAN
    
    input:
    x - data matrix (m,n); m-objects, n-variables
    k - number of objects in a neighborhood of an object
        (minimal number of objects considered as a cluster)
    '''
    
    if len(x.shape)>1:
        m,n = x.shape
    else:
        m = x.shape[0]
        n == 1
    
    prod = np.prod(x.max(axis=0) - x.min(axis=0))
    gamma = special.gamma(0.5*n + 1)
    denom = m * np.sqrt(np.pi**n)
    
    Eps = ((prod * k * gamma) / denom)**(1.0/n)
    
    return Eps


if __name__ == "__main__":
    import matplotlib.pyplot as pp
    import scipy
    x1 = scipy.rand(30,2)*2
    x2 = (scipy.rand(40,2)*0.5+1)*4
    xy = np.concatenate((x1,x2))

    dist = scipy.spatial.distance.pdist(xy)
    print epsilon(xy, 5)
    assignments, label = dbscan(dist, 5, eps=0.9)
    
    for c in np.unique(assignments):
        ind = np.where(assignments == c)
        temp = xy[ind]
        #pp.plot(temp[:,0], temp[:,1], 's', label=str(c))

    #pp.legend()
    #pp.show()