# One-time setup, left disabled. NOTE(review): rJava is presumably a
# dependency of MINE.r - confirm before removing this line.
#install.packages('rJava')
# Load the MINE helper script; MINE() and rMINE(), called further down,
# are presumably defined there (they are not defined in this file).
source('MINE.r')
Social, economic, health, and political indicators from the World Health Organization and GapMinder.
# Load the WHO / GapMinder table and check its size before running MINE.
data_WHO <- read.csv('WHO.csv')
nrow(data_WHO)  # 202 rows, one per country
## [1] 202
ncol(data_WHO)  # 358 columns (identifier columns plus indicators)
## [1] 358
# Preview the top-left corner of the table.
data_WHO[seq_len(6), seq_len(5)]
## Country CountryID Continent Adolescent.fertility.rate....
## 1 Afghanistan 1 1 151
## 2 Albania 2 2 27
## 3 Algeria 3 3 6
## 4 Andorra 4 2 NA
## 5 Angola 5 3 146
## 6 Antigua and Barbuda 6 4 NA
## Adult.literacy.rate....
## 1 28.0
## 2 98.7
## 3 69.9
## 4 NA
## 5 67.4
## 6 NA
# Compare one variable against all others. The log reports this run as
# analysis style "mv=1" and pairs CountryID with every other column,
# writing the results to 'WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv'.
MINE('WHO.csv', 1)
## **********************************************************
## MINE version 1.0.1d
## Copyright 2011 by David Reshef and Yakir Reshef.
##
## This application is licensed under a Creative Commons
## Attribution-NonCommercial-NoDerivs 3.0 Unported License.
## See
## http://creativecommons.org/licenses/by-nc-nd/3.0/ for
## more information.
## **********************************************************
##
##
## input file = WHO.csv
## analysis style = mv=1
## results file name = 'WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv'
## print status frequency = every 100 variable pairs
## status file name = 'WHO.csv,mv=1,cv=0.0,B=n^0.6,Status.txt'
## alpha = 0.6
## numClumpsFactor = 15.0
## debug level = 0
## required common values fraction = 0.0
## garbage collection forced every 2147483647 variable pairs
## reading in dataset...
## done.
## Analyzing...
## 1 calculating: CountryID vs Country...
## 101 calculating: CountryID vs Under-5 mortality rate (Probability of dying aged < 5 years per 1 000 live births) ratio rural-urban...
## 201 calculating: CountryID vs Coal_consumption_per_person...
## 301 calculating: CountryID vs Nuclear_consumption_per_person...
## 357 variable pairs analyzed.
## Sorting results in descending order...
## done. printing results
## Analysis finished. See file "WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv" for output
Show the results.
# Read back the results file written by the mv=1 run (MINE sorts it in
# descending order before writing) and preview the first rows.
res <- read.csv('WHO.csv,mv=1,cv=0.0,B=n^0.6,Results.csv')
head(res)
## X.var
## 1 CountryID
## 2 CountryID
## 3 CountryID
## 4 CountryID
## 5 CountryID
## 6 CountryID
## Y.var
## 1 Antiretroviral therapy coverage among HIV-infected pregt women for PMTCT (%)
## 2 Natural_gas_consumption_per_person
## 3 Under-5 mortality rate (Probability of dying aged < 5 years per 1 000 live births) difference lowest-highest wealth quintile
## 4 Children aged 6-59 months who received vitamin A supplementation (%)
## 5 Number of environment and public health workers
## 6 Number of community and traditional health workers
## MIC..strength. MIC.p.2..nonlinearity. MAS..non.monotonicity.
## 1 0.41445 0.4090700 0.20515001
## 2 0.40137 0.3463985 0.11596000
## 3 0.38776 0.3801823 0.18960000
## 4 0.38666 0.2365068 0.10914001
## 5 0.37826 0.3639218 0.06795999
## 6 0.37770 0.3746602 0.05430001
## MEV..functionality. MCN..complexity. Linear.regression..p.
## 1 0.35146 3.000000 0.07334831
## 2 0.32208 3.321928 0.23446003
## 3 0.38776 3.321928 0.08704989
## 4 0.20808 2.584963 0.38749608
## 5 0.30661 3.584963 -0.11974192
## 6 0.27875 3.321928 0.05513406
Another example: compare one specific pair of variables.
# Compare a single pair of variables. The log reports this run as analysis
# style "1-vs-2" and analyzes exactly one pair, CountryID vs Continent.
MINE('WHO.csv', 1, 2)
## **********************************************************
## MINE version 1.0.1d
## Copyright 2011 by David Reshef and Yakir Reshef.
##
## This application is licensed under a Creative Commons
## Attribution-NonCommercial-NoDerivs 3.0 Unported License.
## See
## http://creativecommons.org/licenses/by-nc-nd/3.0/ for
## more information.
## **********************************************************
##
##
## input file = WHO.csv
## analysis style = 1-vs-2
## results file name = 'WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Results.csv'
## print status frequency = every 100 variable pairs
## status file name = 'WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Status.txt'
## alpha = 0.6
## numClumpsFactor = 15.0
## debug level = 0
## required common values fraction = 0.0
## garbage collection forced every 2147483647 variable pairs
## reading in dataset...
## done.
## Analyzing...
## 1 calculating: CountryID vs Continent...
## 1 variable pairs analyzed.
## Sorting results in descending order...
## done. printing results
## Analysis finished. See file "WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Results.csv" for output
# Read back the single-pair results written by the 1-vs-2 run.
res <- read.csv('WHO.csv,1-vs-2,cv=0.0,B=n^0.6,Results.csv')
head(res)
## X.var Y.var MIC..strength. MIC.p.2..nonlinearity.
## 1 CountryID Continent 0.20486 0.2046357
## MAS..non.monotonicity. MEV..functionality. MCN..complexity.
## 1 0.14449 0.20486 4.459432
## Linear.regression..p.
## 1 0.01497644
In fact, rMINE processes the matrix by row rather than by column: each row is treated as one variable.
# Pass an in-memory 2 x 5 matrix. The log reports a single pair
# (variable 2 vs variable 1), i.e. the two rows are the variables.
rMINE(matrix(1:10, nrow = 2), "matrix", 1)
## **********************************************************
## MINE version 1.0.1d
## Copyright 2011 by David Reshef and Yakir Reshef.
##
## This application is licensed under a Creative Commons
## Attribution-NonCommercial-NoDerivs 3.0 Unported License.
## See
## http://creativecommons.org/licenses/by-nc-nd/3.0/ for
## more information.
## **********************************************************
##
##
## input file = matrix
## analysis style = mv=1
## results file name = 'matrix,mv=1,cv=0.0,B=n^0.6,Results.csv'
## print status frequency = every 100 variable pairs
## status file name = 'matrix,mv=1,cv=0.0,B=n^0.6,Status.txt'
## alpha = 0.6
## numClumpsFactor = 15.0
## debug level = 0
## required common values fraction = 0.0
## garbage collection forced every 2147483647 variable pairs
## reading in dataset...
## done.
## Analyzing...
## 1 calculating: variable 2 vs variable 1...
## 1 variable pairs analyzed.
## Sorting results in descending order...
## done. printing results
## Analysis finished. See file "matrix,mv=1,cv=0.0,B=n^0.6,Results.csv" for output
Show the results.
# Read back the results of the rMINE() call above. That run used analysis
# style mv=1, and its log names the results file
# 'matrix,mv=1,cv=0.0,B=n^0.6,Results.csv'; a 'matrix,mv=0,...' file is not
# produced by that call and would show results from some other run.
res <- read.csv('matrix,mv=1,cv=0.0,B=n^0.6,Results.csv')
head(res)
## X.var Y.var MIC..strength. MIC.p.2..nonlinearity.
## 1 variable 1 variable 2 1 0
## 2 variable 1 variable 3 1 0
## MAS..non.monotonicity. MEV..functionality. MCN..complexity.
## 1 0 -3.402823e+38 2
## 2 0 -3.402823e+38 2
## Linear.regression..p.
## 1 1
## 2 1
Spellman et al. yeast gene expression dataset (CDC15).
# Load the Spellman table and check its size.
data_Spellman <- read.csv('Spellman.csv')
nrow(data_Spellman)  # 4381 rows
## [1] 4381
ncol(data_Spellman)  # 24 columns
## [1] 24
# Preview the first five columns; the column named "time" holds
# identifiers such as YAL001C, and the X40, X50, ... columns hold the values.
head(data_Spellman[, 1:5])
## time X40 X50 X60 X70
## 1 YAL001C -0.070 -0.23 -0.100 0.03
## 2 YAL014C 0.215 0.09 0.025 -0.04
## 3 YAL016W 0.150 0.15 0.220 0.29
## 4 YAL020C -0.350 -0.28 -0.215 -0.15
## 5 YAL022C -0.415 -0.59 -0.580 -0.57
## 6 YAL036C 0.540 0.33 0.215 0.10
colnames(data_Spellman)
## [1] "time" "X40" "X50" "X60" "X70" "X80" "X90" "X100" "X110" "X120"
## [11] "X130" "X140" "X150" "X160" "X170" "X180" "X190" "X200" "X210" "X220"
## [21] "X230" "X240" "X250" "X260"
Microbiome dataset of Turnbaugh et al. (2009).
# Microbiome table with the leading metadata columns (Diet, Source, Donor,
# CollectionMet, Sex) followed by the OTU columns.
data_Microbinome_W <- read.csv('MicrobiomeWithMetadata.csv')
nrow(data_Microbinome_W)  # 675 rows
## [1] 675
ncol(data_Microbinome_W)  # 6701 columns
## [1] 6701
data_Microbinome_W[seq_len(6), seq_len(6)]
## Diet Source Donor CollectionMet Sex OTU0
## 1 0 0 0 0 0 1.56e-11
## 2 0 1 0 0 0 2.36e-11
## 3 0 2 0 1 0 6.77e-11
## 4 0 2 0 0 0 5.52e-11
## 5 0 3 0 0 0 5.24e-11
## 6 0 4 0 1 0 7.67e-11
# Microbiome table without the metadata columns (OTU columns only).
data_Microbinome_N <- read.csv('MicrobiomeNoMetadata.csv')
nrow(data_Microbinome_N)  # 675 rows
## [1] 675
ncol(data_Microbinome_N)  # 6696 columns
## [1] 6696
data_Microbinome_N[seq_len(6), seq_len(6)]
## OTU0 OTU1 OTU2 OTU3 OTU4 OTU5
## 1 1.55852e-11 4.72412e-11 1.22588e-11 4.51566e-11 2.71829e-11 2.70671e-11
## 2 2.35922e-11 9.53320e-11 3.33068e-11 2.66913e-11 2.01927e-11 2.50558e-11
## 3 6.77101e-11 3.68296e-11 8.01854e-11 5.48895e-11 1.34118e-11 5.11521e-11
## 4 5.52027e-11 9.89076e-11 4.58138e-11 3.53977e-11 2.09211e-11 6.25551e-11
## 5 5.23812e-11 6.34033e-11 2.35428e-11 7.46965e-11 2.48983e-11 4.04887e-13
## 6 7.66826e-11 7.21984e-11 5.40671e-11 1.20246e-11 4.53935e-11 1.27173e-11
Individual offensive statistics from 2008 Major League Baseball season.
# Load the 2008 MLB table and check its size.
data_MLB <- read.csv('MLB2008.csv')
nrow(data_MLB)  # 337 rows, one per player
## [1] 337
ncol(data_MLB)  # 134 columns
## [1] 134
# Preview the first five columns.
head(data_MLB[, 1:5])
## PLAYER Record_ID. SALARY ROOKIE POS
## 1 Gregg Zaun 1 3750000 0 2
## 2 Henry Blanco 2 3175000 0 2
## 3 Moises Alou 7 7500000 0 7
## 4 Corey Patterson 9 3000000 0 8
## 5 Rod Barajas 10 700000 0 2
## 6 Rich Aurilia 12 4500000 0 3
Pre-processing: extract the MIC p-value table (n = 60, alpha = 0.6) from its raw CSV.
# Read the raw file as text so the header row can be reused as column names.
data_p <- read.csv('n=60,alpha=0.6.csv', stringsAsFactors = FALSE)
# Row 10 holds the real column headers; the data start at row 11.
# Only the first three columns are kept.
newp <- data_p[11:nrow(data_p), 1:3]
colnames(newp) <- as.character(data_p[10, 1:3])
# Renumber the rows from 1 after dropping the leading rows.
rownames(newp) <- seq_len(nrow(newp))
head(newp)
## MIC >= x gives p <= y +/- (with 95% confidence)
## 1 0.669070 0.000000259 0.000000183
## 2 0.655790 0.000000517 0.000000259
## 3 0.648460 0.000000776 0.000000317
## 4 0.644890 0.000001035 0.000000366
## 5 0.639030 0.000001293 0.000000409
## 6 0.631230 0.000001552 0.000000448
# Confirm the shape and column names of the cleaned table.
nrow(newp)
## [1] 14432
ncol(newp)
## [1] 3
colnames(newp)
## [1] "MIC >= x" "gives p <= y"
## [3] "+/- (with 95% confidence)"
# Numeric conversions, left disabled. NOTE(review): the columns are presumably
# still character here, because the source columns also contained header text
# (row 10) - confirm before relying on numeric types downstream.
#as.numeric(newp$`MIC >= x`)
#as.numeric(newp$`gives p <= y`)
#as.numeric(newp$`+/- (with 95% confidence)`)
# Save the cleaned table; row.names is not overridden, so write.csv's default
# of writing the row names as a leading column applies.
write.csv(newp, file = 'newp(n=60,alpha=0.6).csv')