Skip to content
Phuoc Do edited this page Mar 24, 2016 · 15 revisions

WikiExamples

Some datalib usage examples:

Wrangling Campaign Finance Data

Interactive notebook on nxsheet.com

// runnable in node.js (after `npm install datalib`, of course!)

// let's look at political action committees in 2013-2014
// first download and unzip the committee file from the FEC
//   on the command line (Mac or Linux) you can run:
//   `curl ftp://ftp.fec.gov/FEC/2014/cm14.zip | funzip > cm.txt`
// in any case, we now assume 'cm.txt' is in the working directory

// load datalib
var dl = require('datalib');

// data does not include header, so we include it manually here
// http://www.fec.gov/finance/disclosure/metadata/DataDictionaryCommitteeMaster.shtml
var fields = [
  "CMTE_ID",              // committee id
  "CMTE_NM",              // committee name
  "TRES_NM",              // treasurer name
  "CMTE_ST1",             // street address line 1
  "CMTE_ST2",             // street address line 2
  "CMTE_CITY",            // city
  "CMTE_ST",              // state
  "CMTE_ZIP",             // zip code
  "CMTE_DSGN",            // designation
  "CMTE_TP",              // type
  "CMTE_PTY_AFFILIATION", // party affiliation
  "CMTE_FILING_FREQ",     // filing frequency
  "ORG_TP",               // interest group category
  "CONNECTED_ORG_NM",     // connected organization name (if any)
  "CAND_ID"               // candidate id (if any)
];

// load pipe-delimited committee file with custom header
var cm = dl.dsv('cm.txt', {delimiter: '|', header: fields});

// examine first 20 rows of the table
console.log(dl.format.table(cm, {limit: 20}));

// examine summary profiles for each variable
console.log(dl.format.summary(cm));
// notice that there are many fewer unique treasurer names than rows?
// let's investigate!

// return a sorted table of rows with duplicated field values
function duplicates(data, field) {
  // ensure field is an accessor function
  field = dl.$(field); 

  // construct an array of values that appear more than once
  var dups = dl.groupby(field).                   // group-by field
    count().execute(data).                        // count occurrences
    filter(function(d) { return d.count > 1; }).  // remove non-duplicates
    filter(dl.$valid(field)).                     // remove nulls, etc
    map(field);                                   // extract values

  // filter and sort the input table
  return data.
    filter(dl.$in(field, dups)).  // retain rows with duplicated values
    sort(dl.comparator(field));   // sort ascending by field
}

// examine first 100 duplicated treasurer names
console.log(dl.format.table(duplicates(cm, 'TRES_NM'), {limit: 100}));
// discover that many committees have the same treasurer
// ...including committees with very different addresses
// perhaps this can provide insight into committee orchestration?

Find Related Variables via Mutual Information

// runnable in the console at http://vega.github.io/datalib/

// load cars data set
var cars = dl.json('data/cars.json');

// extract variable names from the first record
// filter out high-cardinality string fields
var fields = dl.keys(cars[0]).filter(function(f) {
  return dl.type(cars, f) !== 'string' || dl.count.distinct(cars, f) < 20;
});

// generate binning functions for each variable
// we will be naive for now and use automatically-chosen bins
// this chooses binning schemes for numeric and date-typed data
// string-typed data will be returned as-is
var b = fields.map(function(f) { return dl.$bin(cars, f); });

var pairs = [];
for (var i=0; i<b.length; ++i) {
  for (var j=i+1; j<b.length; ++j) {
    // extract names of binned variables
    var name_i = dl.name(b[i]);
    var name_j = dl.name(b[j]);

    // compute binned counts for joint distribution
    var cnt = dl.groupby(b[i], b[j]).count().execute(cars);

    // compute mutual information distance between binned variables
    var d = dl.mutual.dist(cnt, name_i, name_j, 'count');

    // collect results
    pairs.push({var1: name_i, var2: name_j, dist: d});
  }
}

// sort variable pairs from smallest to largest distance
pairs.sort(dl.comparator('+dist'));

// examine top 10 related variables
console.log(dl.format.table(pairs, {limit: 10}));

Multi-Table Aggregation

Datalib aggregators are designed to be run with data objects that have the same properties and data types. However, it may sometimes be useful to compute aggregates over multiple data sets with different schemas. As this example shows, multi-table aggregation can be achieved by simply extracting the desired values into a uniform format (below this is simply the raw values) and feeding those into a shared groupby instance.

// construct a streaming groupby instance
var agg = dl.groupby().stream(true).summarize([{
  name: 'value',        // name this summary measure 'value'
  get:  dl.identity,    // use raw input values
  ops:  ['min', 'max'], // compute min and max
  as:   ['min', 'max']  // write results to 'min' and 'max'
}]);

var domain1 = [{a:1}, {a:3}, {a:6}]; // data set 1
var domain2 = [{b:9}, {b:2}];        // data set 2
var f1 = dl.$('a');  // accessor 1
var f2 = dl.$('b');  // accessor 2

// add both data sets to aggregate
var r = agg
  .insert(domain1.map(f1))
  .insert(domain2.map(f2))
  .result()[0];
console.log([r.min, r.max]); // [1, 9]

// remove values from both data sets
r = agg
  .remove([f1(domain1[0])])
  .remove([f2(domain2[0])])
  .result()[0];
console.log([r.min, r.max]); // [2, 6]
Clone this wiki locally