Prepare

# One-time setup, left disabled (presumably required by MINE.r -- confirm):
# install.packages("rJava")
# MINE() and rMINE() used below are not defined in this document; they are
# presumably provided by this script.
source("MINE.r")

Datasets

WHO

Social, economic, health, and political indicators from the World Health Organization and GapMinder.

# Load the WHO / GapMinder table and check its dimensions.
data_WHO <- read.csv("WHO.csv")
nrow(data_WHO)  # 202 rows, one per country
## [1] 202
ncol(data_WHO)  # 358 columns (identifier columns plus indicators)
## [1] 358
# Preview the top-left corner: first 6 rows, first 5 columns.
data_WHO[seq_len(6), seq_len(5)]
##               Country CountryID Continent Adolescent.fertility.rate....
## 1         Afghanistan         1         1                           151
## 2             Albania         2         2                            27
## 3             Algeria         3         3                             6
## 4             Andorra         4         2                            NA
## 5              Angola         5         3                           146
## 6 Antigua and Barbuda         6         4                            NA
##   Adult.literacy.rate....
## 1                    28.0
## 2                    98.7
## 3                    69.9
## 4                      NA
## 5                    67.4
## 6                      NA
# Run MINE on WHO.csv with style argument 1 (logged below as "mv=1"); the log
# shows CountryID being paired with each of the other variables.
MINE("WHO.csv", 1)
## **********************************************************
## MINE version 1.0.1d
## Copyright 2011 by David Reshef and Yakir Reshef.
## 
## This application is licensed under a Creative Commons
## Attribution-NonCommercial-NoDerivs 3.0 Unported License.
## See
## http://creativecommons.org/licenses/by-nc-nd/3.0/ for
## more information.
## **********************************************************
## 
## 
## input file = WHO.csv
## analysis style = mv=1
## results file name = 'WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv'
## print status frequency = every 100 variable pairs
## status file name = 'WHO.csv,mv=1,cv=0.0,B=n^0.6,Status.txt'
## alpha = 0.6
## numClumpsFactor = 15.0
## debug level = 0
## required common values fraction = 0.0
## garbage collection forced every 2147483647 variable pairs
## reading in dataset...
## done.
## Analyzing...
## 1 calculating: CountryID vs Country...
## 101 calculating: CountryID vs Under-5 mortality rate (Probability of dying aged < 5 years per 1 000 live births) ratio rural-urban...
## 201 calculating: CountryID vs Coal_consumption_per_person...
## 301 calculating: CountryID vs Nuclear_consumption_per_person...
## 357 variable pairs analyzed.
## Sorting results in descending order...
## done. printing results
## Analysis finished. See file "WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv" for output

Show the results.

# Read the results file named at the end of the MINE log above and preview
# its first rows (the log reports that results are sorted in descending order).
res <- read.csv("WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv")
head(res)
##       X.var
## 1 CountryID
## 2 CountryID
## 3 CountryID
## 4 CountryID
## 5 CountryID
## 6 CountryID
##                                                                                                                             Y.var
## 1                                                    Antiretroviral therapy coverage among HIV-infected pregt women for PMTCT (%)
## 2                                                                                              Natural_gas_consumption_per_person
## 3 Under-5 mortality rate (Probability of dying aged < 5 years per 1 000 live births) difference lowest-highest wealth quintile
## 4                                                            Children aged 6-59 months who received vitamin A supplementation (%)
## 5                                                                                 Number of environment and public health workers
## 6                                                                              Number of community and traditional health workers
##   MIC..strength. MIC.p.2..nonlinearity. MAS..non.monotonicity.
## 1        0.41445              0.4090700             0.20515001
## 2        0.40137              0.3463985             0.11596000
## 3        0.38776              0.3801823             0.18960000
## 4        0.38666              0.2365068             0.10914001
## 5        0.37826              0.3639218             0.06795999
## 6        0.37770              0.3746602             0.05430001
##   MEV..functionality. MCN..complexity. Linear.regression..p.
## 1             0.35146         3.000000            0.07334831
## 2             0.32208         3.321928            0.23446003
## 3             0.38776         3.321928            0.08704989
## 4             0.20808         2.584963            0.38749608
## 5             0.30661         3.584963           -0.11974192
## 6             0.27875         3.321928            0.05513406

Another example: analyze a single pair of variables (CountryID vs. Continent).

# Analyze one pair only: the log below reports the style as "1-vs-2" and a
# single comparison, CountryID vs Continent.
MINE("WHO.csv", 1, 2)
## **********************************************************
## MINE version 1.0.1d
## Copyright 2011 by David Reshef and Yakir Reshef.
## 
## This application is licensed under a Creative Commons
## Attribution-NonCommercial-NoDerivs 3.0 Unported License.
## See
## http://creativecommons.org/licenses/by-nc-nd/3.0/ for
## more information.
## **********************************************************
## 
## 
## input file = WHO.csv
## analysis style = 1-vs-2
## results file name = 'WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Results.csv'
## print status frequency = every 100 variable pairs
## status file name = 'WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Status.txt'
## alpha = 0.6
## numClumpsFactor = 15.0
## debug level = 0
## required common values fraction = 0.0
## garbage collection forced every 2147483647 variable pairs
## reading in dataset...
## done.
## Analyzing...
## 1 calculating: CountryID vs Continent...
## 1 variable pairs analyzed.
## Sorting results in descending order...
## done. printing results
## Analysis finished. See file "WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Results.csv" for output
# Read the single-pair results file named in the log above and display it.
res <- read.csv("WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Results.csv")
head(res)
##       X.var     Y.var MIC..strength. MIC.p.2..nonlinearity.
## 1 CountryID Continent        0.20486              0.2046357
##   MAS..non.monotonicity. MEV..functionality. MCN..complexity.
## 1                0.14449             0.20486         4.459432
##   Linear.regression..p.
## 1            0.01497644

In fact, rMINE processes the matrix by row rather than by column: each row is treated as one variable.

# Pass an in-memory 2 x 5 matrix to rMINE. The second argument, "matrix", is
# the label that appears as the input/results file prefix in the log below.
rMINE(matrix(1:10, nrow = 2), "matrix", 1)
## **********************************************************
## MINE version 1.0.1d
## Copyright 2011 by David Reshef and Yakir Reshef.
## 
## This application is licensed under a Creative Commons
## Attribution-NonCommercial-NoDerivs 3.0 Unported License.
## See
## http://creativecommons.org/licenses/by-nc-nd/3.0/ for
## more information.
## **********************************************************
## 
## 
## input file = matrix
## analysis style = mv=1
## results file name = 'matrix,mv=1,cv=0.0,B=n^0.6,Results.csv'
## print status frequency = every 100 variable pairs
## status file name = 'matrix,mv=1,cv=0.0,B=n^0.6,Status.txt'
## alpha = 0.6
## numClumpsFactor = 15.0
## debug level = 0
## required common values fraction = 0.0
## garbage collection forced every 2147483647 variable pairs
## reading in dataset...
## done.
## Analyzing...
## 1 calculating: variable 2 vs variable 1...
## 1 variable pairs analyzed.
## Sorting results in descending order...
## done. printing results
## Analysis finished. See file "matrix,mv=1,cv=0.0,B=n^0.6,Results.csv" for output

Show the results.

# Load the results of the rMINE run above. Its log names the output file
# "matrix,mv=1,...Results.csv"; the "mv=0" name used previously belongs to a
# different run (it lists three variables, while this run has only two).
# NOTE(review): the printed table that follows was captured from that stale
# mv=0 file and should be regenerated.
res <- read.csv("matrix,mv=1,cv=0.0,B=n^0.6,Results.csv")
head(res)
##        X.var      Y.var MIC..strength. MIC.p.2..nonlinearity.
## 1 variable 1 variable 2              1                      0
## 2 variable 1 variable 3              1                      0
##   MAS..non.monotonicity. MEV..functionality. MCN..complexity.
## 1                      0       -3.402823e+38                2
## 2                      0       -3.402823e+38                2
##   Linear.regression..p.
## 1                     1
## 2                     1

Gene Expression

Spellman et al. yeast gene expression dataset (CDC15).

# Load the Spellman CDC15 expression table and check its dimensions.
data_Spellman <- read.csv("Spellman.csv")
# 4381 rows; the preview below shows one gene ID per row, stored in the
# column labelled "time".
nrow(data_Spellman)
## [1] 4381
# 24 columns: the ID column plus the X40 ... X260 time-point columns.
ncol(data_Spellman)
## [1] 24
head(data_Spellman[, 1:5])
##      time    X40   X50    X60   X70
## 1 YAL001C -0.070 -0.23 -0.100  0.03
## 2 YAL014C  0.215  0.09  0.025 -0.04
## 3 YAL016W  0.150  0.15  0.220  0.29
## 4 YAL020C -0.350 -0.28 -0.215 -0.15
## 5 YAL022C -0.415 -0.59 -0.580 -0.57
## 6 YAL036C  0.540  0.33  0.215  0.10
# List every column label of the expression table.
names(data_Spellman)
##  [1] "time" "X40"  "X50"  "X60"  "X70"  "X80"  "X90"  "X100" "X110" "X120"
## [11] "X130" "X140" "X150" "X160" "X170" "X180" "X190" "X200" "X210" "X220"
## [21] "X230" "X240" "X250" "X260"

Microbiome Data Set

Microbiome dataset of Turnbaugh et al. (2009).

# Microbiome table that includes the metadata columns (Diet, Source, ...).
data_Microbinome_W <- read.csv("MicrobiomeWithMetadata.csv")

nrow(data_Microbinome_W)  # 675 rows
## [1] 675
ncol(data_Microbinome_W)  # 6701 columns
## [1] 6701
# Preview the first 6 rows and 6 columns.
data_Microbinome_W[seq_len(6), seq_len(6)]
##   Diet Source Donor CollectionMet Sex     OTU0
## 1    0      0     0             0   0 1.56e-11
## 2    0      1     0             0   0 2.36e-11
## 3    0      2     0             1   0 6.77e-11
## 4    0      2     0             0   0 5.52e-11
## 5    0      3     0             0   0 5.24e-11
## 6    0      4     0             1   0 7.67e-11
# Microbiome table without the metadata columns (OTU columns only in the
# preview below).
data_Microbinome_N <- read.csv("MicrobiomeNoMetadata.csv")
nrow(data_Microbinome_N)  # 675 rows
## [1] 675
ncol(data_Microbinome_N)  # 6696 columns
## [1] 6696
# Preview the first 6 rows and 6 columns.
data_Microbinome_N[seq_len(6), seq_len(6)]
##          OTU0        OTU1        OTU2        OTU3        OTU4        OTU5
## 1 1.55852e-11 4.72412e-11 1.22588e-11 4.51566e-11 2.71829e-11 2.70671e-11
## 2 2.35922e-11 9.53320e-11 3.33068e-11 2.66913e-11 2.01927e-11 2.50558e-11
## 3 6.77101e-11 3.68296e-11 8.01854e-11 5.48895e-11 1.34118e-11 5.11521e-11
## 4 5.52027e-11 9.89076e-11 4.58138e-11 3.53977e-11 2.09211e-11 6.25551e-11
## 5 5.23812e-11 6.34033e-11 2.35428e-11 7.46965e-11 2.48983e-11 4.04887e-13
## 6 7.66826e-11 7.21984e-11 5.40671e-11 1.20246e-11 4.53935e-11 1.27173e-11

Baseball

Individual offensive statistics from 2008 Major League Baseball season.

# Load the 2008 MLB offensive statistics and check the table's dimensions.
data_MLB <- read.csv("MLB2008.csv")
nrow(data_MLB)  # 337 rows, one per player
## [1] 337
ncol(data_MLB)  # 134 columns
## [1] 134
head(data_MLB[, 1:5])
##            PLAYER Record_ID.  SALARY ROOKIE POS
## 1      Gregg Zaun          1 3750000      0   2
## 2    Henry Blanco          2 3175000      0   2
## 3     Moises Alou          7 7500000      0   7
## 4 Corey Patterson          9 3000000      0   8
## 5     Rod Barajas         10  700000      0   2
## 6    Rich Aurilia         12 4500000      0   3

p value table

pre-processing

# Pre-process the p-value table for n = 60, alpha = 0.6.
# After parsing, row 10 is used as the column header and the table body is
# taken from row 11 onwards; only the first three columns are kept.
data_p <- read.csv("n=60,alpha=0.6.csv", stringsAsFactors = FALSE)
# Negative indexing drops the leading rows without counting backwards the way
# `11:nrow(data_p)` does when the file has fewer than 11 rows.
newp <- data_p[-seq_len(10), 1:3]
# Hand the header over as a plain character vector instead of a one-row
# data frame.
colnames(newp) <- as.character(unlist(data_p[10, 1:3], use.names = FALSE))
# Renumber the rows from 1; seq_len() stays valid when the table is empty.
rownames(newp) <- seq_len(nrow(newp))
head(newp)
##   MIC >= x gives p <= y +/- (with 95% confidence)
## 1 0.669070  0.000000259               0.000000183
## 2 0.655790  0.000000517               0.000000259
## 3 0.648460  0.000000776               0.000000317
## 4 0.644890  0.000001035               0.000000366
## 5 0.639030  0.000001293               0.000000409
## 6 0.631230  0.000001552               0.000000448
# Check the shape and column labels of the cleaned table, then save it.
nrow(newp)
## [1] 14432
ncol(newp)
## [1] 3
colnames(newp)
## [1] "MIC >= x"                  "gives p <= y"             
## [3] "+/- (with 95% confidence)"
# Numeric conversions, left disabled (the columns are presumably still
# character at this point -- confirm before relying on them as numbers):
# as.numeric(newp$`MIC >= x`)
# as.numeric(newp$`gives p <= y`)
# as.numeric(newp$`+/- (with 95% confidence)`)
write.csv(newp, file = "newp(n=60,alpha=0.6).csv")

New words

  1. curation
  2. textual