%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/
%% Created for Renato Vimieiro at 2017-03-07 18:41:38 -0300
%% Saved with string encoding Unicode (UTF-8)
@misc{wall2014,
Author = {Matthew Wall},
Date-Added = {2017-03-07 21:40:00 +0000},
Date-Modified = {2017-03-07 21:41:36 +0000},
Howpublished = {\url{http://www.bbc.com/news/business-26383058}},
Keywords = {Big Data},
Month = {March},
Title = {Big Data: Are you ready for blast-off?},
Year = {2014}}
@incollection{ramakrishnan09,
Author = {Naren Ramakrishnan and Mohammed Zaki},
Booktitle = {Biological Data Mining},
Chapter = {22},
Date-Added = {2012-07-16 12:43:47 +1000},
Date-Modified = {2012-07-16 12:52:27 +1000},
Editor = {Jake Chen and Stefano Lonardi},
Keywords = {redescription mining, minimal generators},
Pages = {561--586},
Publisher = {CRC Press},
Series = {Chapman \& Hall/CRC Data Mining and Knowledge Discovery Series},
Title = {{Redescription Mining and Applications in Bioinformatics}},
Year = {2009}}
@phdthesis{kumar07,
Author = {D Kumar},
Date-Added = {2012-07-16 12:32:41 +1000},
Date-Modified = {2012-07-16 12:33:57 +1000},
Keywords = {redescription mining},
School = {Virginia Tech},
Title = {Redescription mining: algorithms and applications in {B}ioinformatics},
Year = {2007}}
@inproceedings{zaki05,
Acmid = {1081912},
Address = {New York, NY, USA},
Author = {Zaki, Mohammed J. and Ramakrishnan, Naren},
Booktitle = {Proceedings of the Eleventh ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Date-Added = {2012-07-16 12:25:37 +1000},
Date-Modified = {2012-07-16 12:27:33 +1000},
Doi = {10.1145/1081870.1081912},
Isbn = {1-59593-135-X},
Keywords = {closed itemsets, data mining, minimal generators, redescription mining},
Location = {Chicago, Illinois, USA},
Numpages = {10},
Pages = {364--373},
Publisher = {ACM},
Series = {KDD '05},
Title = {Reasoning about sets using redescription mining},
Url = {http://doi.acm.org/10.1145/1081870.1081912},
Year = {2005},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1081870.1081912},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/1081870.1081912}}
@inproceedings{parida05,
Acmid = {1619467},
Author = {Parida, Laxmi and Ramakrishnan, Naren},
Booktitle = {Proceedings of the 20th National Conference on Artificial Intelligence},
Date-Added = {2012-07-16 12:23:44 +1000},
Date-Modified = {2012-07-16 12:24:44 +1000},
Isbn = {1-57735-236-X},
Keywords = {redescription mining},
Location = {Pittsburgh, Pennsylvania},
Numpages = {8},
Pages = {837--844},
Publisher = {AAAI Press},
Series = {AAAI'05},
Title = {Redescription mining: structure theory and algorithms},
Url = {http://dl.acm.org/citation.cfm?id=1619410.1619467},
Volume = {2},
Year = {2005},
Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=1619410.1619467}}
@article{webb07,
Abstract = {Pattern discovery techniques, such as association rule discovery, explore large search spaces of potential patterns to find those that satisfy some user-specified constraints. Due to the large number of patterns considered, they suffer from an extreme risk of type-1 error, that is, of finding patterns that appear due to chance alone to satisfy the constraints on the sample data. This paper proposes techniques to overcome this problem by applying well-established statistical practices. These allow the user to enforce a strict upper limit on the risk of experimentwise error. Empirical studies demonstrate that standard pattern discovery techniques can discover numerous spurious patterns when applied to random data and when applied to real-world data result in large numbers of patterns that are rejected when subjected to sound statistical evaluation. They also reveal that a number of pragmatic choices about how such tests are performed can greatly affect their power.},
Affiliation = {Monash University Faculty of Information Technology PO Box 75 Clayton Vic. 3800 Australia},
Author = {Webb, Geoffrey},
Date-Added = {2012-07-12 11:24:46 +1000},
Date-Modified = {2012-07-12 12:02:31 +1000},
Issn = {0885-6125},
Issue = {1},
Journal = {Machine Learning},
Keyword = {Computer Science},
Keywords = {interestingness measure, frequent itemset mining, association rules},
Pages = {1--33},
Publisher = {Springer Netherlands},
Title = {Discovering Significant Patterns},
Url = {http://dx.doi.org/10.1007/s10994-007-5006-x},
Volume = {68},
Year = {2007},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-007-5006-x}}
@inproceedings{hamalainen10,
Abstract = {Statistical dependency analysis is the basis of all empirical science. A commonly occurring problem is to find the most significant dependency rules, which describe either positive or negative dependencies between categorical attributes. For example, in medical science one is interested in genetic factors, which can either predispose or prevent diseases. The requirement of statistical significance is essential, because the discoveries should hold also in the future data. Typically, the significance is estimated either by Fisher's exact test or the $\chi^2$-measure. The problem is computationally very difficult, because the number of all possible dependency rules increases exponentially with the number of attributes. As a solution, different kinds of restrictions and heuristics have been applied, but a general, scalable search method has been missing. In this paper, we introduce an efficient algorithm for searching for the top-K globally optimal dependency rules using Fisher's exact test as a measure function. The rules can express either positive or negative dependencies between a set of positive attributes and a single consequent attribute. The algorithm is based on an application of the branch-and-bound search strategy, supplemented by several pruning properties. Especially, we prove a new lower-bound for the Fisher's p, and introduce a new effective pruning principle. The general search algorithm is applicable to other goodness measures, like the $\chi^2$-measure, as well. According to our experiments on classical benchmark data, the algorithm is well scalable and can efficiently handle even dense and high dimensional data sets. In addition, the quality of rules is significantly better than with the $\chi^2$-measure using the same search algorithm.},
Author = {H\"am\"al\"ainen, Wilhelmiina},
Booktitle = {Proceedings of the IEEE 10th International Conference on Data Mining (ICDM 2010)},
Date-Added = {2012-07-12 11:18:51 +1000},
Date-Modified = {2012-07-12 11:21:00 +1000},
Doi = {10.1109/ICDM.2010.143},
Issn = {1550-4786},
Keywords = {Fisher exact test; data mining; p-value; frequent pattern mining, interestingness measure},
Pages = {196--205},
Title = {Efficient Discovery of the Top-K Optimal Dependency Rules with Fisher's Exact Test of Significance},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICDM.2010.143}}
@article{hamalainen12,
Abstract = {Statistical dependency analysis is the basis of all empirical science. A commonly occurring problem is to find the most significant dependency rules, which describe either positive or negative dependencies between categorical attributes. In medical science, for example, one is interested in genetic factors, which can either predispose or prevent diseases. The requirement of statistical significance is essential, because the discoveries should hold also in future data. Typically, the significance is estimated either by Fisher's exact test or the $\chi^2$-measure. The problem is computationally very difficult, because the number of all possible dependency rules increases exponentially with the number of attributes. As a solution, different kinds of restrictions and heuristics have been applied, but a general, scalable search method has been missing. In this paper, we introduce an efficient algorithm, called Kingfisher, for searching for the best non-redundant dependency rules with statistical significance measures. The rules can express either positive or negative dependencies between a set of positive attributes and a single consequent attribute. The algorithm itself is independent from the used goodness measure, but we concentrate on Fisher's exact test and the $\chi^2$-measure. The algorithm is based on an application of the branch-and-bound search strategy, supplemented by several pruning properties. Especially, we prove a new lower bound for Fisher's p and introduce a new effective pruning principle. According to our experiments on classical benchmark data, the algorithm is well scalable and can efficiently handle even dense and high-dimensional data sets. An interesting observation was that Fisher's exact test did not only produce more reliable rules than the $\chi^2$-measure, but it also performed the search much faster.},
Affiliation = {Department of Biosciences, University of Eastern Finland, Joensuu, Finland},
Author = {H\"am\"al\"ainen, Wilhelmiina},
Date-Added = {2012-07-12 11:15:10 +1000},
Date-Modified = {2012-07-12 11:55:08 +1000},
Issn = {0219-1377},
Issue = {2},
Journal = {Knowledge and Information Systems},
Keyword = {Computer Science},
Keywords = {frequent itemset mining, p-value, interestingness measure},
Pages = {383--414},
Publisher = {Springer London},
Title = {Kingfisher: an efficient algorithm for searching for both positive and negative dependency rules with statistical significance measures},
Url = {http://dx.doi.org/10.1007/s10115-011-0432-2},
Volume = {32},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/s10115-011-0432-2}}
@inproceedings{tan02,
Acmid = {775053},
Address = {New York, NY, USA},
Author = {Tan, Pang-Ning and Kumar, Vipin and Srivastava, Jaideep},
Booktitle = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Date-Added = {2012-07-12 10:58:36 +1000},
Date-Modified = {2012-07-12 10:59:53 +1000},
Doi = {10.1145/775047.775053},
Isbn = {1-58113-567-X},
Keywords = {association rules, interestingness measure, frequent itemset mining},
Location = {Edmonton, Alberta, Canada},
Numpages = {10},
Pages = {32--41},
Publisher = {ACM},
Series = {KDD '02},
Title = {Selecting the right interestingness measure for association patterns},
Url = {http://doi.acm.org/10.1145/775047.775053},
Year = {2002},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/775047.775053},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/775047.775053}}
@inproceedings{li08,
Address = {New York, NY, USA},
Author = {Li, Haoyuan and Wang, Yi and Zhang, Dong and Zhang, Ming and Chang, Edward Y.},
Booktitle = {Proceedings of the 2008 ACM Conference on Recommender Systems},
Date-Added = {2012-07-12 10:20:50 +1000},
Date-Modified = {2012-07-12 10:22:02 +1000},
Doi = {10.1145/1454008.1454027},
Isbn = {978-1-60558-093-7},
Keywords = {data mining, frequent itemset mining, parallel frequent pattern mining},
Location = {Lausanne, Switzerland},
Numpages = {8},
Pages = {107--114},
Publisher = {ACM},
Series = {RecSys '08},
Title = {{PFP}: Parallel {FP}-growth for query recommendation},
Url = {http://doi.acm.org/10.1145/1454008.1454027},
Year = {2008},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1454008.1454027},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/1454008.1454027}}
@article{agarwal01,
Abstract = {In this paper we propose algorithms for generation of frequent item sets by successive construction of the nodes of a lexicographic tree of item sets. We discuss different strategies in generation and traversal of the lexicographic tree such as breadth-first search, depth-first search, or a combination of the two. These techniques provide different trade-offs in terms of the I/O, memory, and computational time requirements. We use the hierarchical structure of the lexicographic tree to successively project transactions at each node of the lexicographic tree and use matrix counting on this reduced set of transactions for finding frequent item sets. We tested our algorithm on both real and synthetic data. We provide an implementation of the tree projection method which is up to one order of magnitude faster than other recent techniques in the literature. The algorithm has a well-structured data access pattern which provides data locality and reuse of data for multiple levels of the cache. We also discuss methods for parallelization of the TreeProjection algorithm.},
Author = {Ramesh C. Agarwal and Charu C. Aggarwal and V.V.V. Prasad},
Date-Added = {2012-07-11 19:07:55 +1000},
Date-Modified = {2012-07-11 19:08:29 +1000},
Doi = {10.1006/jpdc.2000.1693},
Issn = {0743-7315},
Journal = {Journal of Parallel and Distributed Computing},
Keywords = {frequent itemset mining, data mining},
Number = {3},
Pages = {350--371},
Title = {A Tree Projection Algorithm for Generation of Frequent Item Sets},
Url = {http://www.sciencedirect.com/science/article/pii/S0743731500916939},
Volume = {61},
Year = {2001},
Bdsk-Url-1 = {http://www.sciencedirect.com/science/article/pii/S0743731500916939},
Bdsk-Url-2 = {http://dx.doi.org/10.1006/jpdc.2000.1693}}
@inproceedings{orlando02,
Abstract = { The performance of an algorithm that mines frequent sets from transactional databases may severely depend on the specific features of the data being analyzed. Moreover, some architectural characteristics of the computational platform used - e.g. the available main memory - can dramatically change its runtime behavior. In this paper we present DCI (Direct Count Intersect), an efficient algorithm for discovering frequent sets from large databases. Due to the multiple heuristics strategies adopted, DCI can adapt its behavior not only to the features of the specific computing platform, but also to the features of the dataset being mined, so that it results very effective in mining both short and long patterns from sparse and dense datasets. Finally we also discuss the parallelization strategies adopted in the design of ParDCI, a distributed and multi-threaded implementation of DCI.},
Author = {Orlando, S. and Palmerini, P. and Perego, R. and Silvestri, F.},
Booktitle = {Proceedings of the IEEE International Conference on Data Mining (ICDM 2002)},
Date-Added = {2012-07-11 19:00:00 +1000},
Date-Modified = {2012-07-11 19:03:47 +1000},
Doi = {10.1109/ICDM.2002.1183921},
Keywords = {frequent sets; data mining; frequent itemset mining},
Pages = {338--345},
Title = {Adaptive and resource-aware mining of frequent sets},
Year = {2002},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICDM.2002.1183921}}
@inproceedings{teodoro10,
Abstract = {Frequent itemset mining (FIM) is a core operation for several data mining applications as association rules computation, correlations, document classification, and many others, which has been extensively studied over the last decades. Moreover, databases are becoming increasingly larger, thus requiring a higher computing power to mine them in reasonable time. At the same time, the advances in high performance computing platforms are transforming them into hierarchical parallel environments equipped with multi-core processors and many-core accelerators, such as GPUs. Thus, fully exploiting these systems to perform FIM tasks poses as a challenging and critical problem that we address in this paper. We present efficient multi-core and GPU accelerated parallelizations of the Tree Projection, one of the most competitive FIM algorithms. The experimental results show that our Tree Projection implementation scales almost linearly in a CPU shared-memory environment after careful optimizations, while the GPU versions are up to 173 times faster than the standard CPU version.},
Author = {Teodoro, G. and Mariano, N. and Meira, W. and Ferreira, R.},
Booktitle = {22nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)},
Date-Added = {2012-07-11 18:50:26 +1000},
Date-Modified = {2012-07-12 12:01:52 +1000},
Doi = {10.1109/SBAC-PAD.2010.15},
Issn = {1550-6533},
Keywords = {association rules; frequent itemset mining; GPU;data mining},
Pages = {47--54},
Title = {Tree Projection-Based Frequent Itemset Mining on Multicore CPUs and GPUs},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/SBAC-PAD.2010.15}}
@inproceedings{silvestri12,
Address = {Los Alamitos, CA, USA},
Author = {Claudio Silvestri and Salvatore Orlando},
Booktitle = {Euromicro Conference on Parallel, Distributed, and Network-Based Processing},
Date-Added = {2012-07-11 18:36:54 +1000},
Date-Modified = {2012-07-11 18:38:59 +1000},
Doi = {10.1109/PDP.2012.94},
Issn = {1066-6192},
Keywords = {GPU, closed itemset mining, frequent closed pattern mining, frequent itemset mining},
Pages = {416--425},
Publisher = {IEEE Computer Society},
Title = {{gpuDCI: Exploiting GPUs in Frequent Itemset Mining}},
Year = {2012},
Bdsk-Url-1 = {http://doi.ieeecomputersociety.org/10.1109/PDP.2012.94}}
@techreport{fang08,
Author = {Wenbin Fang and Ka Keung Lau and Mian Lu and Xiangye Xiao and Chi Kit Lam and Philip Yang Yang and Bingsheng He and Qiong Luo and Pedro V. Sander and Ke Yang},
Date-Added = {2012-07-11 18:24:03 +1000},
Date-Modified = {2012-07-11 18:27:32 +1000},
Institution = {Hong Kong University of Science and Technology},
Keywords = {GPU, frequent itemset mining, frequent pattern mining},
Number = {HKUST-CS08-07},
Title = {Parallel data mining on graphics processors},
Year = {2008}}
@inproceedings{fang09,
Address = {New York, NY, USA},
Author = {Fang, Wenbin and Lu, Mian and Xiao, Xiangye and He, Bingsheng and Luo, Qiong},
Booktitle = {Proceedings of the Fifth International Workshop on Data Management on New Hardware},
Date-Added = {2012-07-11 18:21:01 +1000},
Date-Modified = {2012-07-11 18:22:31 +1000},
Doi = {10.1145/1565694.1565702},
Isbn = {978-1-60558-701-1},
Keywords = {GPU, frequent itemset mining, frequent pattern mining},
Location = {Providence, Rhode Island},
Numpages = {9},
Pages = {34--42},
Publisher = {ACM},
Series = {DaMoN '09},
Title = {Frequent itemset mining on graphics processors},
Url = {http://doi.acm.org/10.1145/1565694.1565702},
Year = {2009},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1565694.1565702},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/1565694.1565702}}
@article{berretta2007,
Author = {Berretta, R. and Mendes, A. and Moscato, P.},
Date-Added = {2012-06-12 15:07:07 +1000},
Date-Modified = {2012-06-12 15:09:11 +1000},
Journal = {Journal of Research and Practice in Information Technology},
Keywords = {abk feature selection},
Number = {4},
Pages = {287--299},
Publisher = {Australian Computer Society},
Title = {Selection of discriminative genes in microarray experiments using mathematical programming},
Volume = {39},
Year = {2007}}
@incollection{cotta04,
Abstract = {We deal with two important problems in pattern recognition that arise in the analysis of large datasets. While most feature subset selection methods use statistical techniques to preprocess the labeled datasets, these methods are generally not linked with the combinatorial properties of the final solutions. We prove that it is NP-hard to obtain an appropriate set of thresholds that will transform a given dataset into a binary instance of a robust feature subset selection problem. We address this problem using an evolutionary algorithm that learns the appropriate value of the thresholds. The empirical evaluation shows that robust subset of genes can be obtained. This evaluation is done using real data corresponding to the gene expression of lymphomas.},
Author = {Cotta, Carlos and Sloper, Christian and Moscato, Pablo},
Booktitle = {Applications of Evolutionary Computing},
Date-Added = {2012-06-12 14:57:53 +1000},
Date-Modified = {2012-06-12 14:59:07 +1000},
Editor = {Raidl, G\"unther and Cagnoni, Stefano and Branke, J\"urgen and Corne, David and Drechsler, Rolf and Jin, Yaochu and Johnson, Colin and Machado, Penousal and Marchiori, Elena and Rothlauf, Franz and Smith, George and Squillero, Giovanni},
Isbn = {978-3-540-21378-9},
Keywords = {abk feature selection},
Pages = {21--30},
Publisher = {Springer Berlin / Heidelberg},
Series = {Lecture Notes in Computer Science},
Title = {Evolutionary Search of Thresholds for Robust Feature Set Selection: Application to the Analysis of Microarray Data},
Url = {http://dx.doi.org/10.1007/978-3-540-24653-4_3},
Volume = {3005},
Year = {2004},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-24653-4_3}}
@inproceedings{moscato2005,
Author = {Moscato, P. and Mathieson, L. and Mendes, A. and Berretta, R.},
Booktitle = {Proceedings of the Twenty-eighth Australasian conference on Computer Science-Volume 38},
Date-Added = {2012-06-12 14:53:51 +1000},
Date-Modified = {2012-06-12 14:55:09 +1000},
Keywords = {abk feature selection},
Organization = {Australian Computer Society},
Pages = {371--379},
Title = {The Electronic Primaries: Predicting the {US} presidency using feature selection with safe data reduction},
Year = {2005}}
@incollection{ravetti09,
Abstract = {In this chapter we present a method based on the $(\alpha,\beta)$-$k$-feature set problem for identifying relevant attributes in high-dimensional datasets for classification purposes. We present a case-study of biomedical interest. Using the gene expression of thousands of genes, we show that the method can give a reduced set that can identify samples as belonging to prostate cancer tumors or not. We thus address the need of finding novel methods that can deal with classification problems that involve feature selection from several thousand features, while we only have on the order of one hundred samples. The methodology appears to be very robust in this prostate cancer case study. It has led to the identification of a set of differentially expressed genes that are highly predictive of the cells transition to a more malignant type, thus departing from the profile which is characteristic of its originating tissue. Although the method is presented with a particular bioinformatics application in mind, it can clearly be used in other domains. A biological analysis illustrates on the relevance of the genes found, and links to the most current developments in prostate cancer biomarker studies.},
Affiliation = {Medicine, The University of Newcastle, Australian Research Council Centre of Excellence in Bioinformatics Centre for Bioinformatics, Biomarker Discovery and Information-Based Callaghan NSW 2308 Australia},
Author = {Ravetti, Mart{\'\i}n and Berretta, Regina and Moscato, Pablo},
Booktitle = {Foundations of Computational Intelligence Volume 5},
Date-Added = {2012-06-12 14:47:40 +1000},
Date-Modified = {2012-06-12 15:14:48 +1000},
Editor = {Abraham, Ajith and Hassanien, Aboul-Ella and Sn\'a\v{s}el, V\'aclav},
Isbn = {978-3-642-01535-9},
Keywords = {abk feature selection},
Pages = {149--175},
Publisher = {Springer Berlin / Heidelberg},
Series = {Studies in Computational Intelligence},
Title = {{Novel Biomarkers for Prostate Cancer Revealed by $\left(\alpha,\beta\right)$-$k$-Feature Sets}},
Url = {http://dx.doi.org/10.1007/978-3-642-01536-6_7},
Volume = {205},
Year = {2009},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-01536-6_7}}
@article{ravetti08,
Abstract = {Background: Alzheimer's disease (AD) is a progressive brain disease with a huge cost to human lives. The impact of the disease is also a growing concern for the governments of developing countries, in particular due to the increasingly high number of elderly citizens at risk. Alzheimer's is the most common form of dementia, a common term for memory loss and other cognitive impairments. There is no current cure for AD, but there are drug and non-drug based approaches for its treatment. In general the drug-treatments are directed at slowing the progression of symptoms. They have proved to be effective in a large group of patients but success is directly correlated with identifying the disease carriers at its early stages. This justifies the need for timely and accurate forms of diagnosis via molecular means. We report here a 5-protein biomarker molecular signature that achieves, on average, a 96% total accuracy in predicting clinical AD. The signature is composed of the abundances of IL-1α, IL-3, EGF, TNF-α and G-CSF.
Methodology/Principal Findings: Our results are based on a recent molecular dataset that has attracted worldwide attention. Our paper illustrates that improved results can be obtained with the abundance of only five proteins. Our methodology consisted of the application of an integrative data analysis method. This four step process included: a) abundance quantization, b) feature selection, c) literature analysis, d) selection of a classifier algorithm which is independent of the feature selection process. These steps were performed without using any sample of the test datasets. For the first two steps, we used the application of Fayyad and Irani's discretization algorithm for selection and quantization, which in turn creates an instance of the (alpha-beta)-k-Feature Set problem; a numerical solution of this problem led to the selection of only 10 proteins.
Conclusions/Significance: The previous study has provided an extremely useful dataset for the identification of AD biomarkers. However, our subsequent analysis also revealed several important facts worth reporting: 1. A 5-protein signature (which is a subset of the 18-protein signature of Ray et al.) has the same overall performance (when using the same classifier). 2. Using more than 20 different classifiers available in the widely-used Weka software package, our 5-protein signature has, on average, a smaller prediction error indicating the independence of the classifier and the robustness of this set of biomarkers (i.e. 96% accuracy when predicting AD against non-demented control). 3. Using very simple classifiers, like Simple Logistic or Logistic Model Trees, we have achieved the following results on 92 samples: 100 percent success to predict Alzheimer's Disease and 92 percent to predict Non Demented Control on the AD dataset.},
Author = {G\'omez Ravetti, Mart\'in AND Moscato, Pablo},
Date-Added = {2012-06-12 14:41:31 +1000},
Date-Modified = {2012-07-12 11:54:20 +1000},
Doi = {10.1371/journal.pone.0003111},
Journal = {PLoS ONE},
Keywords = {Alzheimer disease, abk feature selection},
Number = {9},
Pages = {e3111},
Publisher = {Public Library of Science},
Title = {Identification of a 5-Protein Biomarker Molecular Signature for Predicting Alzheimer's Disease},
Url = {http://dx.doi.org/10.1371%2Fjournal.pone.0003111},
Volume = {3},
Year = {2008},
Bdsk-Url-1 = {http://dx.doi.org/10.1371/journal.pone.0003111}}
@book{hey09,
Abstract = {{A collection of essays expanding on the vision of pioneering computer scientist Jim Gray for a new, fourth paradigm of discovery based on data-intensive science.}},
Author = {Hey, Anthony J. G. and Tansley, Stewart and Tolle, Kristin M.},
Booktitle = {The Fourth Paradigm: Data-Intensive Scientific Discovery},
Citeulike-Article-Id = {6347440},
Citeulike-Linkout-0 = {http://research.microsoft.com/en-us/collaboration/fourthparadigm/},
Citeulike-Linkout-1 = {http://www.worldcat.org/isbn/9780982544204},
Date-Added = {2012-06-11 15:33:50 +1000},
Date-Modified = {2012-06-11 15:34:55 +1000},
Howpublished = {http://research.microsoft.com/en-us/collaboration/fourthparadigm/},
Isbn = {9780982544204},
Keywords = {scientific methodology},
Posted-At = {2009-12-21 11:47:52},
Priority = {2},
Publisher = {Microsoft Research},
Title = {{The Fourth Paradigm: Data-Intensive Scientific Discovery}},
Url = {http://research.microsoft.com/en-us/collaboration/fourthparadigm/},
Year = {2009},
Bdsk-Url-1 = {http://research.microsoft.com/en-us/collaboration/fourthparadigm/}}
@misc{lohr12,
Author = {Steve Lohr},
Date-Added = {2012-06-11 14:46:13 +1000},
Date-Modified = {2012-07-12 11:56:52 +1000},
Howpublished = {{\url{http://www.nytimes.com/2012/02/12/sunday-review/big-datas-impact-in-the-world.html?_r=1}}},
Keywords = {big data, digital revolution},
Lastchecked = {June 11, 2012},
Note = {{(Accessed on June 11, 2012)}},
Title = {The Age of Big Data},
Url = {http://www.nytimes.com/2012/02/12/sunday-review/big-datas-impact-in-the-world.html?_r=1},
Year = {2012},
Bdsk-Url-1 = {http://www.nytimes.com/2012/02/12/sunday-review/big-datas-impact-in-the-world.html?_r=1}}
@misc{insel2012,
Author = {Thomas Insel},
Date-Added = {2012-06-11 14:34:56 +1000},
Date-Modified = {2012-06-11 15:49:06 +1000},
Howpublished = {\url{http://www.nimh.nih.gov/about/director/2012/an-emerging-era-of-big-data.shtml}},
Keywords = {big data, digital revolution},
Lastchecked = {June 11, 2012},
Month = {February},
Note = {{(Accessed on June 11, 2012)}},
Title = {An Emerging Era of Big Data},
Url = {http://www.nimh.nih.gov/about/director/2012/an-emerging-era-of-big-data.shtml},
Year = {2012},
Bdsk-Url-1 = {http://www.nimh.nih.gov/about/director/2012/an-emerging-era-of-big-data.shtml}}
@article{hebert2007,
Address = {Amsterdam, The Netherlands, The Netherlands},
Author = {H\'{e}bert, C\'{e}line and Bretto, Alain and Cr\'{e}milleux, Bruno},
Date-Added = {2012-06-07 12:32:54 +1000},
Date-Modified = {2012-06-07 12:38:00 +1000},
Issn = {0169-2968},
Journal = {Fundamenta Informaticae},
Keywords = {hypergraph dualization, minimal transversals, hypergraph, hitting set, formal concept analysis},
Number = {4},
Numpages = {19},
Pages = {415--433},
Publisher = {IOS Press},
Title = {A Data Mining Formalization to Improve Hypergraph Minimal Transversal Computation},
Url = {http://dl.acm.org/citation.cfm?id=1366548.1366552},
Volume = {80},
Year = {2007},
Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=1366548.1366552}}
@article{murakami11,
Author = {Keisuke Murakami and Takeaki Uno},
Date-Added = {2012-06-07 11:31:14 +1000},
Date-Modified = {2012-06-07 11:33:45 +1000},
Journal = {CoRR},
Keywords = {hypergraph, minimal transversals, hitting set, emerging patterns, maximal frequent itemsets, hypergraph dualization},
Title = {Efficient Algorithms for Dualizing Large-Scale Hypergraphs},
Url = {http://arxiv.org/abs/1102.3813},
Volume = {abs/1102.3813},
Year = {2011},
Bdsk-Url-1 = {http://arxiv.org/abs/1102.3813}}
@incollection{ken03,
Abstract = {In this paper, we give a new algorithm for enumerating all maximal frequent sets using dualization. Frequent sets in transaction data has been used for computing association rules. Maximal frequent sets are important in representing frequent sets in a compact form, thus many researchers have proposed algorithms for enumerating maximal frequent sets. Among these algorithms, some researchers proposed algorithms for enumerating both maximal frequent sets and minimal infrequent sets in a primal-dual way by using a computation of the minimal transversal for a hypergraph, or in other words, hypergraph dualization. We give an improvement for this kind of algorithms in terms of the number of queries of frequency and the space complexity. Our algorithm checks each minimal infrequent set just once, while the existing algorithms check more than once, possibly so many times. Our algorithm does not store the minimal infrequent sets in memory, while the existing algorithms have to store them. The main idea of the improvement is that minimal infrequent sets computed from maximal frequent sets by dualization is still minimal infrequent even if we add a set to the current maximal frequent sets. We analyze the query complexity and the space complexity of our algorithm theoretically, and experimentally evaluate the algorithm to show that the computation time on average is in the order of the multiplication of the number of maximal frequent sets and the number of minimal infrequent sets.},
Author = {Satoh, Ken and Uno, Takeaki},
Booktitle = {Discovery Science},
Date-Added = {2012-06-07 11:26:26 +1000},
Date-Modified = {2012-06-07 11:28:07 +1000},
Editor = {Grieser, Gunter and Tanaka, Yuzuru and Yamamoto, Akihiro},
Isbn = {978-3-540-20293-6},
Keywords = {maximal frequent itemsets, Maximal frequent patterns, maximal itemset mining, hypergraph, minimal transversals, hitting set},
Pages = {256--268},
Publisher = {Springer Berlin / Heidelberg},
Series = {Lecture Notes in Computer Science},
Title = {Enumerating Maximal Frequent Sets Using Irredundant Dualization},
Url = {http://dx.doi.org/10.1007/978-3-540-39644-4_22},
Volume = {2843},
Year = {2003},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-39644-4_22}}
@inproceedings{bailey03,
Abstract = { Computing the minimal transversals of a hypergraph is an important problem in computer science that has significant applications in data mining. We present a new algorithm for computing hypergraph transversals and highlight their close connection to an important class of patterns known as emerging patterns. We evaluate our technique on a number of large datasets and show that it outperforms previous approaches by a factor of 9-29 times.},
Author = {Bailey, J. and Manoukian, T. and Ramamohanarao, Kotagiri},
Booktitle = {Third IEEE International Conference on Data Mining},
Date-Added = {2012-06-06 12:38:01 +1000},
Date-Modified = {2012-06-06 12:39:21 +1000},
Doi = {10.1109/ICDM.2003.1250958},
Keywords = {data mining; emerging patterns; minimal transversals; hypergraph; hitting set},
Pages = {485--488},
Title = {A fast algorithm for computing hypergraph transversals and its application in mining emerging patterns},
Year = {2003},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICDM.2003.1250958}}
@article{vazquez09,
Abstract = {BACKGROUND:Identifying effective drug combinations that significantly improve over single agents is a challenging problem. Pairwise combinations already represent a huge screening effort. Beyond two drug combinations the task seems unfeasible.RESULTS:In this work we introduce a method to uncover drug combinations with a putative effective response when presented to a heterogeneous population of malignant agents (strains), such as cancer cell lines or viruses. Using data quantifying the effect of single drugs over several individual strains, we search for minimal drug combinations that successfully target all strains. We show that the latter problem can be mapped to a minimal hitting set problem in mathematics. We illustrate this approach using data for the NCI60 panel of tumor derived cell lines, uncovering 14 anticancer drug combinations.CONCLUSION:The drug-response graph and the associated minimal hitting set method can be used to uncover effective drug combinations in anticancer drug screens and drug development programs targeting heterogeneous populations of infectious agents such as HIV.},
Author = {Vazquez, Alexei},
Date-Added = {2012-06-06 12:28:25 +1000},
Date-Modified = {2012-06-06 12:29:08 +1000},
Doi = {10.1186/1752-0509-3-81},
Issn = {1752-0509},
Journal = {BMC Systems Biology},
Keywords = {hitting set, minimal transversals, hypergraph},
Number = {1},
Pages = {81},
Pubmedid = {19660129},
Title = {Optimal drug combinations and minimal hitting sets},
Url = {http://www.biomedcentral.com/1752-0509/3/81},
Volume = {3},
Year = {2009},
Bdsk-Url-1 = {http://www.biomedcentral.com/1752-0509/3/81},
Bdsk-Url-2 = {http://dx.doi.org/10.1186/1752-0509-3-81}}
@article{mellor2010,
Abstract = {Therapies consisting of a combination of agents are an attractive proposition, especially in the context of diseases such as cancer, which can manifest with a variety of tumor types in a single case. However uncovering usable drug combinations is expensive both financially and temporally. By employing computational methods to identify candidate combinations with a greater likelihood of success we can avoid these problems, even when the amount of data is prohibitively large. Hitting Set is a combinatorial problem that has useful application across many fields, however as it is NP-complete it is traditionally considered hard to solve exactly. We introduce a more general version of the problem, $(\alpha,\beta,d)$-Hitting Set, which allows more precise control over how and what the hitting set targets. Employing the framework of Parameterized Complexity we show that despite being NP-complete, the $(\alpha,\beta,d)$-Hitting Set problem is fixed-parameter tractable with a kernel of size $O(\alpha d k^{d})$ when we parameterize by the size $k$ of the hitting set and the maximum number $\alpha$ of the minimum number of hits, and taking the maximum degree $d$ of the target sets as a constant. We demonstrate the application of this problem to multiple drug selection for cancer therapy, showing the flexibility of the problem in tailoring such drug sets. The fixed-parameter tractability result indicates that for low values of the parameters the problem can be solved quickly using exact methods. We also demonstrate that the problem is indeed practical, with computation times on the order of 5 seconds, as compared to previous Hitting Set applications using the same dataset which exhibited times on the order of 1 day, even with relatively relaxed notions for what constitutes a low value for the parameters. Furthermore the existence of a kernelization for $(\alpha,\beta,d)$-Hitting Set indicates that the problem is readily scalable to large datasets.},
Author = {Mellor, Drew AND Prieto, Elena AND Mathieson, Luke AND Moscato, Pablo},
Date-Added = {2012-06-06 12:22:12 +1000},
Date-Modified = {2012-07-12 11:57:30 +1000},
Doi = {10.1371/journal.pone.0013055},
Journal = {PLoS ONE},
Keywords = {hitting set, minimal transversals, hypergraph},
Number = {10},
Pages = {e13055},
Publisher = {Public Library of Science},
Title = {{A Kernelisation Approach for Multiple \emph{d}-Hitting Set and Its Application in Optimal Multi-Drug Therapeutic Combinations}},
Url = {http://dx.doi.org/10.1371%2Fjournal.pone.0013055},
Volume = {5},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1371/journal.pone.0013055}}
@book{berge73,
Author = {Claude Berge},
Date-Added = {2012-06-05 12:06:50 +1000},
Date-Modified = {2012-06-05 12:12:22 +1000},
Keywords = {hypergraph, graph, minimal transversals},
Publisher = {Elsevier},
Series = {North-Holland Mathematical Library},
Title = {Graphs and Hypergraphs},
Url = {http://www.sciencedirect.com/science/article/pii/S0924650909703265},
Volume = {6},
Year = {1973},
Bdsk-Url-1 = {http://www.sciencedirect.com/science/article/pii/S0924650909703265}}
@phdthesis{li01_thesis,
Author = {Jinyan Li},
Date-Added = {2012-06-04 17:34:27 +1000},
Date-Modified = {2012-06-04 17:37:14 +1000},
Keywords = {emerging patterns, jumping emerging patterns, associative classification, classification},
Read = {Yes},
School = {University of Melbourne},
Title = {Mining emerging patterns to construct accurate and efficient classifiers},
Year = {2001}}
@article{vimieiro12,
Abstract = {Disjunctive minimal generators were proposed by Zhao, Zaki, and Ramakrishnan (2006). They defined disjunctive closed itemsets and disjunctive minimal generators through the disjunctive support function. We prove that the disjunctive support function is compatible with the closure operator presented by Zhao et al. (2006). Such compatibility allows us to adapt the original version of the Titanic algorithm, proposed by Stumme, Taouil, Bastide, Pasquier, and Lakhal (2002) to mine iceberg concept lattices and closed itemsets, to mine disjunctive minimal generators. We present TitanicOR, a new breadth-first algorithm for mining disjunctive minimal generators. We evaluate the performance of our method with both synthetic and real data sets and compare TitanicOR's performance with the performance of BLOSOM (Zhao et al., 2006), the state of the art method and sole algorithm available prior to TitanicOR for mining disjunctive minimal generators. We show that TitanicOR's breadth-first approach is up to two orders of magnitude faster than BLOSOM's depth-first approach.},
Author = {Vimieiro, Renato and Moscato, Pablo},
Doi = {10.1016/j.eswa.2012.01.141},
Issn = {0957-4174},
Journal = {Expert Systems with Applications},
Keywords = {BLOSOM,Boolean expressions,Closed itemsets,Disjunctions,Frequent pattern mining,Minimal generators,TitanicOR},
Number = {9},
Pages = {8228--8238},
Title = {{Mining disjunctive minimal generators with TitanicOR}},
Url = {http://www.sciencedirect.com/science/article/pii/S0957417412001613},
Volume = {39},
Year = {2012},
Bdsk-Url-1 = {http://www.sciencedirect.com/science/article/pii/S0957417412001613},
Bdsk-Url-2 = {http://dx.doi.org/10.1016/j.eswa.2012.01.141}}
@article{vimieiro04,
Author = {Vimieiro, Renato and Z\'{a}rate, LE and Silva, Jpd},
Journal = {Intelligence and Soft},
Keywords = {formal concept anal,neural networks,rule extraction},
Pages = {334--339},
Title = {{Rule extraction from trained neural networks via formal concept analysis}},
Url = {http://www.actapress.com/Abstract.aspx?paperId=18596},
Year = {2004},
Bdsk-Url-1 = {http://www.actapress.com/Abstract.aspx?paperId=18596}}
@inproceedings{loekito06,
Address = {New York, NY, USA},
Author = {Loekito, Elsa and Bailey, James},
Booktitle = {Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Doi = {10.1145/1150402.1150438},
Isbn = {1-59593-339-5},
Keywords = {classification, data mining, disjunctive emerging patterns, disjunctive patterns, emerging patterns, zero-suppressed binary decision diagrams,contrast patterns},
Pages = {307--316},
Publisher = {ACM},
Series = {KDD '06},
Title = {{Fast mining of high dimensional expressive contrast patterns using zero-suppressed binary decision diagrams}},
Url = {http://doi.acm.org/10.1145/1150402.1150438},
Year = {2006},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1150402.1150438}}
@inproceedings{loekito09,
Address = {Berlin, Heidelberg},
Author = {Loekito, Elsa and Bailey, James},
Booktitle = {Proceedings of the 13th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining},
Doi = {10.1007/978-3-642-01307-2\_44},
Isbn = {978-3-642-01306-5},
Keywords = {classification, data mining, disjunctive emerging patterns, disjunctive patterns, emerging patterns, quantitative association rules,expressive contrasts},
Pages = {483--490},
Publisher = {Springer-Verlag},
Series = {PAKDD '09},
Title = {{Using Highly Expressive Contrast Patterns for Classification - Is It Worthwhile?}},
Url = {http://dx.doi.org/10.1007/978-3-642-01307-2\_44},
Year = {2009},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-01307-2%5C_44}}
@inproceedings{wang03,
Address = {New York, NY, USA},
Author = {Wang, Jianyong and Han, Jiawei and Pei, Jian},
Booktitle = {Proceedings of the Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Doi = {10.1145/956750.956779},
Isbn = {1-58113-737-0},
Keywords = {frequent closed itemsets, mining methods and algorithms,association rules},
Pages = {236--245},
Publisher = {ACM},
Series = {KDD '03},
Title = {{{CLOSET+}: searching for the best strategies for mining frequent closed itemsets}},
Url = {http://doi.acm.org/10.1145/956750.956779},
Year = {2003},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/956750.956779}}
@article{grahne05,
Abstract = {Efficient algorithms for mining frequent itemsets are crucial for
mining association rules as well as for many other data mining tasks. Methods
for mining frequent itemsets have been implemented using a prefix-tree
structure, known as an FP-tree, for storing compressed information about
frequent itemsets. Numerous experimental results have demonstrated that these
algorithms perform extremely well. In this paper, we present a novel FP-array
technique that greatly reduces the need to traverse FP-trees, thus obtaining
significantly improved performance for FP-tree-based algorithms. Our technique
works especially well for sparse data sets. Furthermore, we present new
algorithms for mining all, maximal, and closed frequent itemsets. Our algorithms
use the FP-tree data structure in combination with the FP-array technique
efficiently and incorporate various optimization techniques. We also present
experimental results comparing our methods with existing algorithms. The results
show that our methods are the fastest for many cases. Even though the algorithms
consume much memory when the data sets are sparse, they are still the fastest
ones when the minimum support is low. Moreover, they are always among the
fastest algorithms and consume less memory than other methods when the data sets
are dense.},
Author = {Grahne, G and Zhu, J},
Doi = {10.1109/TKDE.2005.166},
Issn = {1041-4347},
Journal = {IEEE Transactions on Knowledge and Data Engineering},
Keywords = {association rules; data mining; frequent itemset m},
Number = {10},
Pages = {1347--1362},
Title = {{Fast algorithms for frequent itemset mining using {FP}-trees}},
Volume = {17},
Year = {2005},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/TKDE.2005.166}}
@inproceedings{cong05,
Address = {New York, NY, USA},
Author = {Cong, Gao and Tan, Kian-Lee and Tung, Anthony K H and Xu, Xin},
Booktitle = {Proceedings of the 2005 ACM SIGMOD International Conference on Management of Data},
Doi = {10.1145/1066157.1066234},
Isbn = {1-59593-060-4},
Pages = {670--681},
Publisher = {ACM},
Series = {SIGMOD '05},
Title = {{Mining top-K covering rule groups for gene expression data}},
Url = {http://doi.acm.org/10.1145/1066157.1066234},
Year = {2005},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1066157.1066234}}
@inproceedings{cheng07,
Abstract = {The application of frequent patterns in classification
appeared in sporadic studies and achieved initial success in the
classification of relational data, text documents and graphs. In this
paper, we conduct a systematic exploration of frequent pattern-based
classification, and provide solid reasons supporting this methodology.
It was well known that feature combinations (patterns) could capture
more underlying semantics than single features. However, inclusion of
infrequent patterns may not significantly improve the accuracy due to
their limited predictive power. By building a connection between pattern
frequency and discriminative measures such as information gain and
Fisher score, we develop a strategy to set minimum support in frequent
pattern mining for generating useful patterns. Based on this strategy,
coupled with a proposed feature selection algorithm, discriminative
frequent patterns can be generated for building high quality
classifiers. We demonstrate that the frequent pattern-based
classification framework can achieve good scalability and high accuracy
in classifying large datasets. Empirical studies indicate that
significant improvement in classification accuracy is achieved (up to
12\% in UCI datasets) using the so-selected discriminative frequent
patterns.},
Author = {Cheng, Hong and Yan, Xifeng and Han, Jiawei and Hsu, Chih-Wei},
Booktitle = {Proceedings of the 23rd IEEE International Conference on Data Engineering, ICDE 2007},
Doi = {10.1109/ICDE.2007.367917},
Keywords = {discriminative pattern mining;frequent pattern min},
Pages = {716--725},
Title = {{Discriminative Frequent Pattern Analysis for Effective Classification}},
Year = {2007},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICDE.2007.367917}}
@article{wang06,
Abstract = {Many studies have shown that rule-based classifiers perform well in
classifying categorical and sparse high-dimensional databases. However, a
fundamental limitation with many rule-based classifiers is that they find the
rules by employing various heuristic methods to prune the search space and
select the rules based on the sequential database covering paradigm. As a
result, the final set of rules that they use may not be the globally best rules
for some instances in the training database. To make matters worse, these
algorithms fail to fully exploit some more effective search space pruning
methods in order to scale to large databases. In this paper, we present a new
classifier, HARMONY, which directly mines the final set of classification rules.
HARMONY uses an instance-centric rule-generation approach and it can assure
that, for each training instance, one of the highest-confidence rules covering
this instance is included in the final rule set, which helps in improving the
overall accuracy of the classifier. By introducing several novel search
strategies and pruning methods into the rule discovery process, HARMONY also has
high efficiency and good scalability. Our thorough performance study with some
large text and categorical databases has shown that HARMONY outperforms many
well-known classifiers in terms of both accuracy and computational efficiency
and scales well with regard to the database size},
Author = {Wang, Jianyong and Karypis, George},
Doi = {10.1109/TKDE.2006.179},
Issn = {1041-4347},
Journal = {IEEE Transactions on Knowledge and Data Engineering},
Keywords = {HARMONY;data mining;FP-Growth;associative classifi},
Number = {11},
Pages = {1497--1511},
Title = {{On Mining Instance-Centric Classification Rules}},
Volume = {18},
Year = {2006},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/TKDE.2006.179}}
@inproceedings{li01_CMAR,
Abstract = {Previous studies propose that associative classification has high
classification accuracy and strong flexibility at handling unstructured data.
However, it still suffers from the huge set of mined rules and sometimes biased
classification or overfitting since the classification is based on only a single
high-confidence rule. The authors propose a new associative classification
method, CMAR, i.e., Classification based on Multiple Association Rules. The
method extends an efficient frequent pattern mining method, FP-growth,
constructs a class distribution-associated FP-tree, and mines large databases
efficiently. Moreover, it applies a CR-tree structure to store and retrieve
mined association rules efficiently, and prunes rules effectively based on
confidence, correlation and database coverage. The classification is performed
based on a weighted chi;2 analysis using multiple strong association rules. Our
extensive experiments on 26 databases from the UCI machine learning database
repository show that CMAR is consistent, highly effective at classification of
various kinds of databases and has better average classification accuracy in
comparison with CBA and C4.5. Moreover, our performance study shows that the
method is highly efficient and scalable in comparison with other reported
associative classification methods},
Author = {Li, Wenmin and Han, Jiawei and Pei, Jian},
Booktitle = {Proceedings of the IEEE International Conference on Data Mining (ICDM 2001)},
Doi = {10.1109/ICDM.2001.989541},
Keywords = {CMAR;FP-growth;associative classification;frequent},
Pages = {369--376},
Title = {{{CMAR}: accurate and efficient classification based on multiple class-association rules}},
Year = {2001},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICDM.2001.989541}}
@book{quinlan1993,
Address = {San Francisco, CA, USA},
Author = {Quinlan, J Ross},
Isbn = {1-55860-238-0},
Keywords = {data mining,classification},
Publisher = {Morgan Kaufmann Publishers Inc.},
Title = {{C4.5: programs for machine learning}},
Year = {1993}}
@incollection{dong99_CAEP,
Abstract = {Emerging patterns (EPs) are itemsets whose supports change
significantly from one dataset to another; they were recently proposed to
capture multi-attribute contrasts between data classes, or trends over time. In
this paper we propose a new classifier, CAEP, using the following main ideas
based on EPs: (i) Each EP can sharply differentiate the class membership of a
(possibly small) fraction of instances containing the EP, due to the big
difference between its supports in the opposing classes; we define the
differentiating power of the EP in terms of the supports and their ratio, on
instances containing the EP. (ii) For each instance t , by aggregating the
differentiating power of a fixed, automatically selected set of EPs, a score is
obtained for each class. The scores for all classes are normalized and the
largest score determines t 's class. CAEP is suitable for many applications,
even those with large volumes of high (e.g. 45) dimensional data; it does not
depend on dimension reduction on data; and it is usually equally accurate on all
classes even if their populations are unbalanced. Experiments show that CAEP has
consistent good predictive accuracy, and it almost always outperforms C4.5 and
CBA. By using efficient, border-based algorithms (developed elsewhere) to
discover EPs, CAEP scales up on data volume and dimensionality. Observing that
accuracy on the whole dataset is too coarse description of classifiers, we also
used a more accurate measure, sensitivity and precision , to better characterize
the performance of classifiers. CAEP is also very good under this measure.},
Author = {Dong, Guozhu and Zhang, Xiuzhen and Wong, Limsoon and Li, Jinyan},
Booktitle = {Discovery Science},
Editor = {Arikawa, Setsuo and Furukawa, Koichi},
Isbn = {978-3-540-66713-1},
Keywords = {associative classification, emerging patterns, frequent itemset mining, frequent pattern mining, jumping emerging patterns, maximal frequent itemsets,data mining},
Pages = {737},
Publisher = {Springer Berlin / Heidelberg},
Series = {Lecture Notes in Computer Science},
Title = {{{CAEP}: Classification by Aggregating Emerging Patterns}},
Url = {http://dx.doi.org/10.1007/3-540-46846-3_4},
Volume = {1721},
Year = {1999},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/3-540-46846-3_4}}
@article{li01,
Abstract = {Classification aims to discover a model from training data that
can be used to predict the class of test instances. In this paper, we propose
the use of jumping emerging patterns (JEPs) as the basis for a new classifier
called the JEP-Classifier . Each JEP can capture some crucial difference
between a pair of datasets. Then, aggregating all JEPs of large supports can
produce a more potent classification power. Procedurally, the JEP-Classifier
learns the pair-wise features (sets of JEPs) contained in the training data,
and uses the collective impacts contributed by the most expressive pair-wise
features to determine the class labels of the test data. Using only the most
expressive JEPs in the JEP-Classifier strengthens its resistance to noise in
the training data, and reduces its complexity (as there are usually a very
large number of JEPs). We use two algorithms for constructing the
JEP-Classifier which are both scalable and efficient. These algorithms make
use of the border representation to efficiently store and manipulate JEPs. We
also present experimental results which show that the JEP-Classifier achieves
much higher testing accuracies than the association-based classifier of Liu
et al. (1998), which was reported to outperform C4.5 in general.},
Author = {Li, Jinyan and Dong, Guozhu and Ramamohanarao, Kotagiri},
Issn = {0219-1377},
Journal = {Knowledge and Information Systems},
Keywords = {associative classification, emerging patterns, frequent itemset mining, frequent pattern mining, jumping emerging patterns, maximal frequent itemsets,data mining},
Number = {2},
Pages = {131--145},
Publisher = {Springer London},
Title = {{Making Use of the Most Expressive Jumping Emerging Patterns for Classification}},
Url = {http://dx.doi.org/10.1007/PL00011662},
Volume = {3},
Year = {2001},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/PL00011662}}
@inproceedings{liu98,
Author = {Liu, Bing and Hsu, Wynne and Ma, Yiming},
Booktitle = {Proceedings of the Fourth International Conference on Knowledge Discovery and Data Mining (KDD-98)},
Editor = {Agrawal, Rakesh and Stolorz, Paul E and Piatetsky-Shapiro, Gregory},
Keywords = {CBA, discriminative pattern mining, frequent itemset mining; data mining, frequent pattern mining,associative classification},
Publisher = {AAAI Press},
Title = {{Integrating Classification and Association Rule Mining}},
Year = {1998}}
@article{guizhen06,
Abstract = {In this paper we study the complexity-theoretic aspects of mining maximal frequent patterns,
from the perspective of counting the number of all distinct solutions.
We present the first formal proof that the problem of counting the number of maximal frequent itemsets
in a database of transactions, given an arbitrary support threshold, is \#P-complete, thereby providing
theoretical evidence that the problem of mining maximal frequent itemsets is NP-hard. We also extend our
complexity analysis to other similar data mining problems that deal with complex data structures, such as
sequences, trees, and graphs. We investigate several variants of these mining problems in which the
patterns of interest are subsequences, subtrees, or subgraphs, and show that the associated problems of
counting the number of maximal frequent patterns are all either \#P-complete or \#P-hard.},
Author = {Yang, Guizhen},
Doi = {10.1016/j.tcs.2006.05.029},
Issn = {0304-3975},
Journal = {Theoretical Computer Science},
Keywords = {\#P-complete,Complexity,Data mining,Maximal frequent patterns},
Number = {1-3},
Pages = {63--85},
Title = {{Computational aspects of mining maximal frequent patterns}},
Url = {http://www.sciencedirect.com/science/article/pii/S0304397506003355},
Volume = {362},
Year = {2006},
Bdsk-Url-1 = {http://www.sciencedirect.com/science/article/pii/S0304397506003355},
Bdsk-Url-2 = {http://dx.doi.org/10.1016/j.tcs.2006.05.029}}
@article{piatetsky-shapiro03,
Address = {New York, NY, USA},
Author = {Piatetsky-Shapiro, Gregory and Tamayo, Pablo},
Date-Modified = {2012-07-12 11:58:30 +1000},
Doi = {10.1145/980972.980974},
Issn = {1931-0145},
Journal = {SIGKDD Explorations Newsletter},
Number = {2},
Pages = {1--5},
Publisher = {ACM},
Title = {{Microarray data mining: facing the challenges}},
Url = {http://doi.acm.org/10.1145/980972.980974},
Volume = {5},
Year = {2003},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/980972.980974}}
@article{dries10,
Abstract = {We adapt Mitchell's version space algorithm for mining k-CNF formulas.
Advantages of this algorithm are that it runs in a single pass over the data, is
conceptually simple, can be used for missing value prediction, and has
interesting theoretical properties, while an empirical evaluation on
classification tasks yields competitive predictive results.},
Author = {Dries, A and {De Raedt}, L and Nijssen, S},
Doi = {10.1109/TKDE.2009.152},
Issn = {1041-4347},
Journal = {IEEE Transactions on Knowledge and Data Engineering},
Keywords = {Boolean expressions;conjunctive normal form;data mining},
Month = may,
Number = {5},
Pages = {743--748},
Title = {{Mining Predictive k-{CNF} Expressions}},
Volume = {22},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/TKDE.2009.152}}
@inproceedings{mannila_toivonen96,
Author = {Mannila, Heikki and Toivonen, Hannu},
Booktitle = {Proceedings of the Second International Conference on Knowledge Discovery and Data Mining},
Editor = {Simoudis, Evangelos and Han, Jia W and Fayyad, Usama},
Keywords = {disjunctive pattern mining, frequent itemset mining, frequent pattern mining,data mining},
Pages = {189--194},
Publisher = {AAAI Press},
Title = {{Multiple Uses of Frequent Sets and Condensed Representations: Extended Abstract}},
Year = {1996}}
@article{hughes00,
Abstract = {Ascertaining the impact of uncharacterized perturbations on
the cell is a fundamental problem in biology. Here, we describe how a
single assay can be used to monitor hundreds of different cellular
functions simultaneously. We constructed a reference database or
"compendium" of expression profiles corresponding to 300 diverse
mutations and chemical treatments in S. cerevisiae, and we show that the
cellular pathways affected can be determined by pattern matching, even
among very subtle profiles. The utility of this approach is validated by
examining profiles caused by deletions of uncharacterized genes: we
identify and experimentally confirm that eight uncharacterized open
reading frames encode proteins required for sterol metabolism, cell wall
function, mitochondrial respiration, or protein synthesis. We also show
that the compendium can be used to characterize pharmacological
perturbations by identifying a novel target of the commonly used drug
dyclonine.},
Address = {Rosetta Inpharmatics, Inc., Kirkland, Washington 98034, USA.},
Author = {Hughes, T R and Marton, M J and Jones, A R and Roberts, C J and Stoughton, R and Armour, C D and Bennett, H A and Coffey, E and Dai, H and He, Y D and Kidd, M J and King, A M and Meyer, M R and Slade, D and Lum, P Y and Stepaniants, S B and Shoemaker, D D and Gachotte, D and Chakraburtty, K and Simon, J and Bard, M and Friend, S H},
Doi = {10.1016/S0092-8674(00)00015-5},
Issn = {0092-8674},
Journal = {Cell},
Keywords = {networks, yeastDataset,datasets},
Number = {1},
Pages = {109--126},
Title = {{Functional discovery via a compendium of expression profiles}},
Url = {http://www.cell.com/retrieve/pii/S0092867400000155},
Volume = {102},
Year = {2000},
Bdsk-Url-1 = {http://www.cell.com/retrieve/pii/S0092867400000155},
Bdsk-Url-2 = {http://dx.doi.org/10.1016/S0092-8674(00)00015-5}}
@inproceedings{cong04b,
Abstract = { Microarray data typically contains a large number of
columns and a small number of rows, which poses a great challenge for
existing frequent (closed) pattern mining algorithms that discover
patterns in item enumeration space. In this paper, we propose two
algorithms that explore the row enumeration space to mine frequent
closed patterns. Several experiments on real-life gene expression data
show that the algorithms are faster than existing algorithms, including
CLOSET, CHARM, CLOSET+ and CARPENTER.},
Author = {Cong, Gao and Tan, Kian-Lee and Tung, A K H and Pan, Feng},
Booktitle = {ICDM '04: Proceedings of the Fourth IEEE International Conference on Data Mining},
Doi = {10.1109/ICDM.2004.10070},
Keywords = {frequent closed pattern mining; item enumeration space},
Pages = {363--366},
Title = {{Mining frequent closed patterns in microarray data}},
Year = {2004},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICDM.2004.10070}}
@article{mcintosh07,
Abstract = {We present an association rule mining method for mining high
confidence rules, which describe interesting gene relationships from
microarray datasets. Microarray datasets typically contain an order of
magnitude more genes than experiments, rendering many data mining
methods impractical as they are optimised for sparse datasets. A new
family of row-enumeration rule mining algorithms has emerged to
facilitate mining in dense datasets. These algorithms rely on pruning
infrequent relationships to reduce the search space by using the support
measure. This major shortcoming results in the pruning of many
potentially interesting rules with low support but high confidence. We
propose a new row-enumeration rule mining method, MaxConf, to mine high
confidence rules from microarray data. MaxConf is a support-free
algorithm which directly uses the confidence measure to effectively
prune the search space. Experiments on three microarray datasets show
that MaxConf outperforms support-based rule mining with respect to
scalability and rule extraction. Furthermore, detailed biological
analyses demonstrate the effectiveness of our approach -- the rules
discovered by MaxConf are substantially more interesting and meaningful
compared with support-based methods.},
Address = {Los Alamitos, CA, USA},
Author = {McIntosh, Tara and Chawla, Sanjay},
Date-Modified = {2012-07-12 11:57:09 +1000},
Doi = {10.1109/tcbb.2007.1050},
Issn = {1545-5963},
Journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
Keywords = {association rules, frequent itemset mining, frequent pattern mining, high confidence rule mining, microarray analysis, vertical itemset mining,Data mining},
Number = {4},
Pages = {611--623},
Publisher = {IEEE Computer Society Press},
Title = {{High Confidence Rule Mining for Microarray Analysis}},
Url = {http://dx.doi.org/10.1109/tcbb.2007.1050},
Volume = {4},
Year = {2007},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/tcbb.2007.1050}}
@article{delgado01,