Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: make AGNES algorithm closer to R #11

Merged
merged 8 commits into from
Jul 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,26 @@ Hierarchical clustering algorithms in JavaScript.

## Installation

`npm install ml-hclust`
`npm i ml-hclust`

## [API Documentation](https://mljs.github.io/hclust/)

## Methods
## Usage

Generate a clustering hierarchy.
### AGNES

```js
const { agnes } = require('ml-hclust');

const tree = agnes(data, {
method: 'ward',
});
```

## Implemented algorithms

- [x] [AGNES](http://dx.doi.org/10.1002/9780470316801.ch5) (AGglomerative NESting): Continuously merge nodes that have the least dissimilarity.
- [x] [DIANA](http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470276800.html) (Divisive ANAlysis): The process starts at the root with all the points as one cluster and recursively splits the higher level clusters to build the dendrogram.
- [ ] [DIANA](http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470276800.html) (Divisive ANAlysis): The process starts at the root with all the points as one cluster and recursively splits the higher level clusters to build the dendrogram.
- [ ] [BIRCH](http://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf) (Balanced Iterative Reducing and Clustering using Hierarchies): Incrementally construct a CF (Clustering Feature) tree, a hierarchical data structure for multiphase clustering
- [ ] [CURE](http://www.cs.bu.edu/fac/gkollios/ada05/LectNotes/guha98cure.pdf) (Clustering Using REpresentatives):
- [ ] [CHAMELEON](http://www.google.ch/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0CCQQFjAAahUKEwj6t4n_sZbGAhXDaxQKHXCLCmQ&url=http%3A%2F%2Fglaros.dtc.umn.edu%2Fgkhome%2Ffetch%2Fpapers%2FchameleonCOMPUTER99.pdf&ei=kDqBVfqvKsPXUfCWqqAG&usg=AFQjCNEYcGqCxN5N_GlP4Z__UF09aHegQg&sig2=9JkxZ5VS7iDbiJT-imX5Pg&bvm=bv.96041959,d.d24&cad=rja)
Expand All @@ -32,6 +42,7 @@ npm test
## Authors

- [Miguel Asencio](/~https://github.com/maasencioh)
- [Michael Zasso](/~https://github.com/targos)

## License

Expand Down
29 changes: 29 additions & 0 deletions experiment.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { agnes } from './src';

// Symmetric 5x5 dissimilarity matrix used as the input of this experiment.
const distances = [
  [0, 17, 21, 31, 23],
  [17, 0, 30, 34, 21],
  [21, 30, 0, 28, 39],
  [31, 34, 28, 0, 43],
  [23, 21, 39, 43, 0],
];

// Run AGNES with the 'ward' method; the input is flagged as a precomputed
// distance matrix rather than a list of feature vectors.
const tree = agnes(distances, { method: 'ward', isDistanceMatrix: true });

// Walk the whole tree: print every leaf with 1-based numbering and collect
// every strictly positive node height.
const heights = [];
tree.traverse((node) => {
  if (node.isLeaf) {
    console.log(node.index + 1);
  }
  if (node.height > 0) {
    heights.push(node.height);
  }
});

// Print the collected heights in ascending order.
heights.sort((a, b) => a - b);
console.log(heights);
// console.log(require('util').inspect(tree, { depth: Infinity, colors: true }));
37 changes: 20 additions & 17 deletions hclust.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,43 @@ export type AgglomerationMethod =
| 'single'
| 'complete'
| 'average'
| 'upgma'
| 'wpgma'
| 'median'
| 'wpgmc'
| 'centroid'
| 'ward';
| 'upgmc'
| 'ward'
| 'ward2';

export interface AgnesOptions<T> {
distanceFunction?: (a: T, b: T) => number;
method?: AgglomerationMethod;
isDistanceMatrix?: boolean;
}

export interface DianaOptions<T> {
distanceFunction?: (a: T, b: T) => number;
}
// export interface DianaOptions<T> {
// distanceFunction?: (a: T, b: T) => number;
// }

export interface Cluster {
children: Cluster[];
distance: number;
index: ClusterLeaf[];
height: number;
size: number;
index: number;
isLeaf: boolean;
cut: (threshold: number) => Cluster[];
group: (minGroups: number) => Cluster;
group: (groups: number) => Cluster;
traverse: (cb: (cluster: Cluster) => void) => void;
}

export interface ClusterLeaf extends Cluster {
children: [];
distance: 0;
index: number;
indices: () => number[];
}

export function agnes<T = number[]>(
data: T[],
options?: AgnesOptions<T>,
): Cluster;

export function diana<T = number[]>(
data: T[],
options?: DianaOptions<T>,
): Cluster;
// export function diana<T = number[]>(
// data: T[],
// options?: DianaOptions<T>,
// ): Cluster;
19 changes: 10 additions & 9 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ml-hclust",
"version": "2.0.3",
"version": "3.0.0-1",
"description": "Hierarchical clustering algorithms",
"main": "hclust.js",
"module": "src/index.js",
Expand Down Expand Up @@ -41,18 +41,19 @@
},
"homepage": "/~https://github.com/mljs/hclust",
"devDependencies": {
"@babel/plugin-transform-modules-commonjs": "^7.4.4",
"eslint": "^5.16.0",
"@babel/plugin-transform-modules-commonjs": "^7.5.0",
"eslint": "^6.0.1",
"eslint-config-cheminfo": "^1.20.1",
"eslint-plugin-import": "^2.17.2",
"eslint-plugin-jest": "^22.5.1",
"jest": "^24.7.1",
"rollup": "^1.10.1"
"eslint-plugin-import": "^2.18.0",
"eslint-plugin-jest": "^22.7.2",
"esm": "^3.2.25",
"jest": "^24.8.0",
"rollup": "^1.16.7"
},
"dependencies": {
"heap": "^0.2.6",
"ml-array-median": "^1.1.1",
"ml-distance-euclidean": "^2.0.0",
"ml-distance-matrix": "^1.0.0"
"ml-distance-matrix": "^2.0.0",
"ml-matrix": "^6.1.2"
}
}
69 changes: 43 additions & 26 deletions src/Cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,29 @@ import Heap from 'heap';
export default class Cluster {
constructor() {
this.children = [];
this.distance = -1;
this.index = [];
this.height = 0;
this.size = 1;
this.index = -1;
this.isLeaf = false;
}

/**
* Creates an array of values where maximum distance smaller than the threshold
* Creates an array of clusters where the maximum height is smaller than the threshold
* @param {number} threshold
* @return {Array <Cluster>}
* @return {Array<Cluster>}
*/
cut(threshold) {
if (threshold < 0) throw new RangeError('Threshold too small');
var root = new Cluster();
root.children = this.children;
root.distance = this.distance;
root.index = this.index;
var list = [root];
var ans = [];
if (typeof threshold !== 'number') {
throw new TypeError('threshold must be a number');
}
if (threshold < 0) {
throw new RangeError('threshold must be a positive number');
}
let list = [this];
const ans = [];
while (list.length > 0) {
var aux = list.shift();
if (threshold >= aux.distance) {
const aux = list.shift();
if (threshold >= aux.height) {
ans.push(aux);
} else {
list = list.concat(aux.children);
Expand All @@ -32,22 +35,22 @@ export default class Cluster {
}

/**
* Merge the leaves in the minimum way to have 'minGroups' number of clusters
* @param {number} minGroups - Them minimum number of children the first level of the tree should have
* Merge the leaves in the minimum way to have `groups` number of clusters.
* @param {number} groups - The number of children the first level of the tree should have.
* @return {Cluster}
*/
group(minGroups) {
if (!Number.isInteger(minGroups) || minGroups < 1) {
throw new RangeError('Number of groups must be a positive integer');
group(groups) {
if (!Number.isInteger(groups) || groups < 1) {
throw new RangeError('groups must be a positive integer');
}

const heap = new Heap(function (a, b) {
return b.distance - a.distance;
const heap = new Heap((a, b) => {
return b.height - a.height;
});

heap.push(this);

while (heap.size() < minGroups) {
while (heap.size() < groups) {
var first = heap.pop();
if (first.children.length === 0) {
break;
Expand All @@ -57,25 +60,39 @@ export default class Cluster {

var root = new Cluster();
root.children = heap.toArray();
root.distance = this.distance;
root.height = this.height;

return root;
}

/**
* Traverses the tree depth-first and provide callback to be called on each individual node
* Traverses the tree depth-first and calls the provided callback with each individual node
* @param {function} cb - The callback to be called on each node encounter
* @type {Cluster}
*/
traverse(cb) {
function visit(root, callback) {
callback(root);
if (root.children) {
for (var i = root.children.length - 1; i >= 0; i--) {
visit(root.children[i], callback);
for (const child of root.children) {
visit(child, callback);
}
}
}
visit(this, cb);
}

/**
* Returns a list of indices for all the leaves of this cluster.
* The list is ordered in such a way that a dendrogram could be drawn without crossing branches.
* @returns {Array<number>}
*/
indices() {
const result = [];
this.traverse((cluster) => {
if (cluster.isLeaf) {
result.push(cluster.index);
}
});
return result;
}
}
10 changes: 0 additions & 10 deletions src/ClusterLeaf.js

This file was deleted.

31 changes: 31 additions & 0 deletions src/__tests__/agnes.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import * as data from '../../testData';

import { agnes } from '..';

// The root height must be the same whether AGNES receives raw features...
test('AGNES with feature matrix', () => {
  const tree = agnes(data.features1);
  expect(tree.height).toBeCloseTo(7.2111, 4);
});

// ...or a distance matrix passed with `isDistanceMatrix: true`.
test('AGNES with distance matrix', () => {
  const tree = agnes(data.distanceMatrix1, { isDistanceMatrix: true });
  expect(tree.height).toBeCloseTo(7.2111, 4);
});

// With the second distance matrix the root height must not exceed 1.
test('AGNES with distance matrix 2', () => {
  const tree = agnes(data.distanceMatrix2, { isDistanceMatrix: true });
  expect(tree.height).not.toBeGreaterThan(1);
});

// With the centroid method every node height must be a real, non-negative number.
test('AGNES centroid', () => {
  const options = { isDistanceMatrix: true, method: 'centroid' };
  const tree = agnes(data.distanceMatrix2, options);

  tree.traverse((cluster) => {
    const { height } = cluster;
    expect(typeof height).toBe('number');
    expect(height).not.toBe(NaN);
    expect(height).not.toBeLessThan(0);
  });
});
46 changes: 46 additions & 0 deletions src/__tests__/cluster.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import * as data from '../../testData';

import { agnes } from '..';

// The root spans all 10 observations and splits into two halves of 5.
test('size', () => {
  const tree = agnes(data.features1);
  expect(tree.size).toBe(10);
  const [first, second] = tree.children;
  expect(first.size).toBe(5);
  expect(second.size).toBe(5);
});

// Cutting at a threshold of 1.5 must yield 5 clusters.
test('cut', () => {
  const tree = agnes(data.features1);
  const pieces = tree.cut(1.5);
  expect(pieces).toHaveLength(5);
});

// Asking for 8 groups must produce a root with exactly 8 children.
test('group', () => {
  const tree = agnes(data.features1);
  const grouped = tree.group(8);
  expect(grouped.children).toHaveLength(8);
});

// `indices` lists every observation exactly once, in the expected order.
test('indices', () => {
  const tree = agnes(data.features1);
  const leafOrder = tree.indices();
  expect(leafOrder).toHaveLength(data.features1.length);
  expect(leafOrder).toStrictEqual([6, 5, 9, 8, 7, 3, 1, 0, 4, 2]);
});

// Leaves carry a non-negative index; every other node keeps index -1.
test('traverse, isLeaf and index', () => {
  const tree = agnes(data.features1);
  let innerCount = 0;
  let leafCount = 0;
  tree.traverse((node) => {
    if (node.isLeaf) {
      leafCount++;
      expect(node.index).toBeGreaterThan(-1);
    } else {
      innerCount++;
      expect(node.index).toBe(-1);
    }
  });
  expect(innerCount).toBe(9);
  expect(leafCount).toBe(10);
});
Loading