-
Notifications
You must be signed in to change notification settings - Fork 132
Examples
Phuoc Do edited this page Mar 24, 2016
·
15 revisions
Wiki ▸ Examples
Some datalib usage examples:
- Wrangling Campaign Finance Data - custom text format, analysis of duplicate values
- Find Related Variables via Mutual Information - mining associations among variables
- Multi-Table Aggregation - computing shared aggregates over diverse input data
Interactive notebook on nxsheet.com
// runnable in node.js (after `npm install datalib`, of course!)
// let's look at political action committees in 2013-2014
// first download and unzip the committee file from the FEC
// on the command line (Mac or Linux) you can run:
// `curl ftp://ftp.fec.gov/FEC/2014/cm14.zip | funzip > cm.txt`
// in any case, we now assume 'cm.txt' is in the working directory
// load datalib
var dl = require('datalib');
// data does not include header, so we include it manually here
// http://www.fec.gov/finance/disclosure/metadata/DataDictionaryCommitteeMaster.shtml
var fields = [
"CMTE_ID", // committee id
"CMTE_NM", // committee name
"TRES_NM", // treasurer name
"CMTE_ST1", // street address line 1
"CMTE_ST2", // street address line 2
"CMTE_CITY", // city
"CMTE_ST", // state
"CMTE_ZIP", // zip code
"CMTE_DSGN", // designation
"CMTE_TP", // type
"CMTE_PTY_AFFILIATION", // party affiliation
"CMTE_FILING_FREQ", // filing frequency
"ORG_TP", // interest group category
"CONNECTED_ORG_NM", // connected organization name (if any)
"CAND_ID" // candidate id (if any)
];
// load pipe-delimited committee file with custom header
var cm = dl.dsv('cm.txt', {delimiter: '|', header: fields});
// examine first 20 rows of the table
console.log(dl.format.table(cm, {limit: 20}));
// examine summary profiles for each variable
console.log(dl.format.summary(cm));
// notice that there are many fewer unique treasurer names than rows?
// let's investigate!
// return a sorted table of rows with duplicated field values
function duplicates(data, field) {
// ensure field is an accessor function
field = dl.$(field);
// construct an array of values that appear more than once
var dups = dl.groupby(field). // group-by field
count().execute(data). // count occurrences
filter(function(d) { return d.count > 1; }). // remove non-duplicates
filter(dl.$valid(field)). // remove nulls, etc
map(field); // extract values
// filter and sort the input table
return data.
filter(dl.$in(field, dups)). // retain rows with duplicated values
sort(dl.comparator(field)); // sort ascending by field
}
// examine first 100 duplicated treasurer names
console.log(dl.format.table(duplicates(cm, 'TRES_NM'), {limit: 100}));
// discover that many committees have the same treasurer
// ...including committees with very different addresses
// perhaps this can provide insight into committee orchestration?
// runnable in the console at http://vega.github.io/datalib/
// load cars data set
var cars = dl.json('data/cars.json');
// extract variable names from the first record
// filter out high-cardinality string fields
var fields = dl.keys(cars[0]).filter(function(f) {
return dl.type(cars, f) !== 'string' || dl.count.distinct(cars, f) < 20;
});
// generate binning functions for each variable
// we will be naive for now and use automatically-chosen bins
// this chooses binning schemes for numeric and date-typed data
// string-typed data will be returned as-is
var b = fields.map(function(f) { return dl.$bin(cars, f); });
var pairs = [];
for (var i=0; i<b.length; ++i) {
for (var j=i+1; j<b.length; ++j) {
// extract names of binned variables
var name_i = dl.name(b[i]);
var name_j = dl.name(b[j]);
// compute binned counts for joint distribution
var cnt = dl.groupby(b[i], b[j]).count().execute(cars);
// compute mutual information distance between binned variables
var d = dl.mutual.dist(cnt, name_i, name_j, 'count');
// collect results
pairs.push({var1: name_i, var2: name_j, dist: d});
}
}
// sort variable pairs from smallest to largest distance
pairs.sort(dl.comparator('+dist'));
// examine top 10 related variables
console.log(dl.format.table(pairs, {limit: 10}));
Datalib aggregators are designed to be run with data objects that have the same properties and data types. However, it may sometimes be useful to compute aggregates over multiple data sets with different schemas. As this example shows, multi-table aggregation can be achieved by simply extracting the desired values into a uniform format (below this is simply the raw values) and feeding those into a shared groupby instance.
// construct a streaming groupby instance
var agg = dl.groupby().stream(true).summarize([{
name: 'value', // name this summary measure 'value'
get: dl.identity, // use raw input values
ops: ['min', 'max'], // compute min and max
as: ['min', 'max'] // write results to 'min' and 'max'
}]);
var domain1 = [{a:1}, {a:3}, {a:6}]; // data set 1
var domain2 = [{b:9}, {b:2}]; // data set 2
var f1 = dl.$('a'); // accessor 1
var f2 = dl.$('b'); // accessor 2
// add both data sets to aggregate
var r = agg
.insert(domain1.map(f1))
.insert(domain2.map(f2))
.result()[0];
console.log([r.min, r.max]); // [1, 9]
// remove values from both data sets
r = agg
.remove([f1(domain1[0])])
.remove([f2(domain2[0])])
.result()[0];
console.log([r.min, r.max]); // [2, 6]