-
Notifications
You must be signed in to change notification settings - Fork 0
/
array-programming-acm-accepted.bib
1764 lines (1762 loc) · 97.9 KB
/
array-programming-acm-accepted.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{Venkat:2014:NEP:2581122.2544141,
author = {Venkat, Anand and Shantharam, Manu and Hall, Mary
and Strout, Michelle Mills},
title = {Non-affine Extensions to Polyhedral Code Generation},
booktitle = {Proceedings of Annual IEEE/ACM International
Symposium on Code Generation and Optimization},
series = {CGO '14},
year = {2014},
isbn = {978-1-4503-2670-4},
location = {Orlando, FL, USA},
pages = {185:185--185:194},
articleno = {185},
numpages = {10},
url = {http://doi.acm.org/10.1145/2544137.2544141},
doi = {10.1145/2544137.2544141},
acmid = {2544141},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {code generation, inspector/executor, loop
coalescing, non-affine, polyhedral model},
abstract = {This paper describes a loop transformation framework that extends a polyhedral representation of loop nests to represent and transform computations with non-affine index arrays in loop bounds and subscripts via a new interface between compile-time and run-time abstractions. Polyhedra scanning code generation, which historically applies an affine mapping to the subscript expressions of the statements in a loop nest, is modified to apply non-affine mappings involving index arrays that are represented at compile time by uninterpreted functions; non-affine loop bounds involving index arrays are also represented. When appropriate, an inspector is utilized to capture the non-affine subscript mappings, and a generalized loop coalescing transformation is introduced as a non-affine transformation to support non-affine loop bounds. With this support, complex sequences of new and existing transformations can then be composed. We demonstrate the effectiveness of this framework by optimizing sparse matrix vector multiplication operations targeting GPUs for different matrix structures and parallelization strategies. This approach achieves performance that is comparable to or greater than the hand-tuned CUSP library; for two of the implementations it achieves an average 1.14$\times$ improvement over CUSP across a collection of sparse matrices, while the third performs on average within \% of CUSP.},
notes = {Targets GPUs but seems a general technique. NOTE(review): a percentage figure before "\%" in the abstract was lost in transcription -- recover from the published abstract.},
review = {fbie: accepted <2016-01-14 11:59:00>},
}
@inproceedings{Ureche:2012:SCS:2103746.2103762,
author = {Ureche, Vlad and Rompf, Tiark and Sujeeth, Arvind
and Chafi, Hassan and Odersky, Martin},
title = {{StagedSAC}: A Case Study in Performance-oriented {DSL}
Development},
booktitle = {Proceedings of the ACM SIGPLAN 2012 Workshop on
Partial Evaluation and Program Manipulation},
series = {PEPM '12},
year = {2012},
isbn = {978-1-4503-1118-2},
location = {Philadelphia, Pennsylvania, USA},
pages = {73--82},
numpages = {10},
url = {http://doi.acm.org/10.1145/2103746.2103762},
doi = {10.1145/2103746.2103762},
acmid = {2103762},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {DSL, SAC, domain specific languages, optimization,
single assignment c, staging},
abstract = {Domain-specific languages (DSLs) can bridge the gap between high-level programming and efficient execution. However, implementing compiler tool-chains for performance oriented DSLs requires significant effort. Recent research has produced methodologies and frameworks that promise to reduce this development effort by enabling quick transition from library-only, purely embedded DSLs to optimizing compilation. In this case study we report on our experience implementing a compiler for StagedSAC. StagedSAC is a DSL for arithmetic processing with multidimensional arrays modeled after the stand-alone language SAC (Single Assignment C). The main language feature of both SAC and StagedSAC is a loop construction that enables high-level and concise implementations of array algorithms. At the same time, the functional semantics of the two languages allow for advanced compiler optimizations and parallel code generation. We describe how we were able to quickly evolve from a pure library DSL to a performance-oriented compiler with a good speedup and only minor syntax changes using the technique of Lightweight Modular Staging. We also describe the optimizations we perform to obtain fast code and how we plan to generate parallel code with minimal effort using the Delite framework.},
notes = {Last sentence of abstract seems interesting.},
review = {fbie: accepted <2016-01-14 11:48:40>},
}
@inproceedings{Tang:1990:CTD:77726.255155,
author = {Tang, Peiyi and Yew, Pen-Chung and Zhu, Chuan-Qi},
title = {Compiler Techniques for Data Synchronization in
Nested Parallel Loops},
booktitle = {Proceedings of the 4th International Conference on
Supercomputing},
series = {ICS '90},
year = {1990},
isbn = {0-89791-369-8},
location = {Amsterdam, The Netherlands},
pages = {177--186},
numpages = {10},
url = {http://doi.acm.org/10.1145/77726.255155},
doi = {10.1145/77726.255155},
acmid = {255155},
publisher = {ACM},
address = {New York, NY, USA},
abstract = {The major source of parallelism in ordinary programs is do loops. When loop iterations of parallelized loops are executed on multiprocessors, the cross-iteration data dependencies need to be enforced by synchronization between processors. Existing data synchronization schemes are either too simple to handle general nested loop structures with non-trivial array subscript functions or inefficient due to the large run-time overhead.
In this paper, we propose a new synchronization scheme based on two data-oriented synchronization instructions: synch\_read(x,s) and synch\_write(x,s). We present the algorithm to compute the ordering number, s, for each data access. Using our scheme, a parallelizing compiler can parallelize a general nested loop structure with complicated cross-iteration data dependencies. If the computations of ordering numbers cannot be done at compile time, the run-time overhead is smaller than the other existing run-time schemes.},
review = {fbie: accepted <2016-01-14 10:32:58>},
}
@article{Tang:1990:CTD:255129.255155,
author = {Tang, Peiyi and Yew, Pen-Chung and Zhu, Chuan-Qi},
title = {Compiler Techniques for Data Synchronization in
Nested Parallel Loops},
journal = {SIGARCH Comput. Archit. News},
issue_date = {Sept. 1990},
volume = {18},
number = {3b},
month = jun,
year = {1990},
issn = {0163-5964},
pages = {177--186},
numpages = {10},
url = {http://doi.acm.org/10.1145/255129.255155},
doi = {10.1145/255129.255155},
acmid = {255155},
publisher = {ACM},
address = {New York, NY, USA},
abstract = {
The major source of parallelism in ordinary programs is do loops. When loop iterations of parallelized loops are executed on multiprocessors, the cross-iteration data dependencies need to be enforced by synchronization between processors. Existing data synchronization schemes are either too simple to handle general nested loop structures with non-trivial array subscript functions or inefficient due to the large run-time overhead.
In this paper, we propose a new synchronization scheme based on two data-oriented synchronization instructions: synch\_read(x,s) and synch\_write(x,s). We present the algorithm to compute the ordering number, s, for each data access. Using our scheme, a parallelizing compiler can parallelize a general nested loop structure with complicated cross-iteration data dependencies. If the computations of ordering numbers cannot be done at compile time, the run-time overhead is smaller than the other existing run-time schemes.
},
review = {fbie: accepted <2016-01-14 10:31:31>},
}
@inproceedings{Blelloch:1996:PTS:232627.232650,
author = {Blelloch, Guy E. and Greiner, John},
title = {A Provable Time and Space Efficient Implementation
of {NESL}},
booktitle = {Proceedings of the First ACM SIGPLAN International
Conference on Functional Programming},
series = {ICFP '96},
year = {1996},
isbn = {0-89791-770-7},
location = {Philadelphia, Pennsylvania, USA},
pages = {213--225},
numpages = {13},
url = {http://doi.acm.org/10.1145/232627.232650},
doi = {10.1145/232627.232650},
acmid = {232650},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=232650&ftid=38674&dwn=1&CFID=744742186&CFTOKEN=77967707},
review = {fbie: accepted <2016-01-13 14:09:49>},
abstract = {In this paper we prove time and space bounds for the implementation of the programming language NESL on various parallel machine models. NESL is a sugared typed $\lambda$-calculus with a set of array primitives and an explicit parallel map over arrays. Our results extend previous work on provable implementation bounds for functional languages by considering space and by including arrays. For modeling the cost of NESL we augment a standard call-by-value operational semantics to return two cost measures: a DAG representing the sequential dependence in the computation, and a measure of the space taken by a sequential implementation. We show that a NESL program with w work (nodes in the DAG), d depth (levels in the DAG), and s sequential space can be implemented on a p processor butterfly network, hypercube, or CRCW PRAM using O(w/p + d log p) time and O(s + dp log p) reachable space. For programs with sufficient parallelism these bounds are optimal in that they give linear speedup and use space within a constant factor of the sequential space.},
}
@article{Blelloch:1996:PTS:232629.232650,
author = {Blelloch, Guy E. and Greiner, John},
title = {A Provable Time and Space Efficient Implementation
of {NESL}},
journal = {SIGPLAN Not.},
issue_date = {June 15, 1996},
volume = {31},
number = {6},
month = jun,
year = {1996},
issn = {0362-1340},
pages = {213--225},
numpages = {13},
url = {http://doi.acm.org/10.1145/232629.232650},
doi = {10.1145/232629.232650},
acmid = {232650},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=232650&ftid=38674&dwn=1&CFID=744742186&CFTOKEN=77967707},
review = {fbie: accepted <2016-01-13 14:09:45>},
abstract = {In this paper we prove time and space bounds for the implementation of the programming language NESL on various parallel machine models. NESL is a sugared typed $\lambda$-calculus with a set of array primitives and an explicit parallel map over arrays. Our results extend previous work on provable implementation bounds for functional languages by considering space and by including arrays. For modeling the cost of NESL we augment a standard call-by-value operational semantics to return two cost measures: a DAG representing the sequential dependence in the computation, and a measure of the space taken by a sequential implementation. We show that a NESL program with w work (nodes in the DAG), d depth (levels in the DAG), and s sequential space can be implemented on a p processor butterfly network, hypercube, or CRCW PRAM using O(w/p + d log p) time and O(s + dp log p) reachable space. For programs with sufficient parallelism these bounds are optimal in that they give linear speedup and use space within a constant factor of the sequential space.},
}
@inproceedings{Anderson:1990:CHA:93542.93561,
author = {Anderson, Steven and Hudak, Paul},
title = {Compilation of {Haskell} Array Comprehensions for
Scientific Computing},
booktitle = {Proceedings of the ACM SIGPLAN 1990 Conference on
Programming Language Design and Implementation},
series = {PLDI '90},
year = {1990},
isbn = {0-89791-364-7},
location = {White Plains, New York, USA},
pages = {137--149},
numpages = {13},
url = {http://doi.acm.org/10.1145/93542.93561},
doi = {10.1145/93542.93561},
acmid = {93561},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=93561&ftid=14811&dwn=1&CFID=744742186&CFTOKEN=77967707},
review = {fbie: accepted <2016-01-13 13:50:11>},
}
@article{Anderson:1990:CHA:93548.93561,
author = {Anderson, Steven and Hudak, Paul},
title = {Compilation of {Haskell} Array Comprehensions for
Scientific Computing},
journal = {SIGPLAN Not.},
issue_date = {Jun. 1990},
volume = {25},
number = {6},
month = jun,
year = {1990},
issn = {0362-1340},
pages = {137--149},
numpages = {13},
url = {http://doi.acm.org/10.1145/93548.93561},
doi = {10.1145/93548.93561},
acmid = {93561},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=93561&ftid=14811&dwn=1&CFID=744742186&CFTOKEN=77967707},
fullTextFile = {.slirm_cache/Anderson_1990_Compilation-of.pdf},
notes = {Analysis of inter-data dependencies in array comprehensions.},
review = {fbie: accepted <2016-01-13 13:50:06>},
}
@inproceedings{Hall:1994:UHT:182409.156781,
author = {Hall, Cordelia V.},
title = {Using {Hindley-Milner} Type Inference to Optimise List
Representation},
booktitle = {Proceedings of the 1994 ACM Conference on LISP and
Functional Programming},
series = {LFP '94},
year = {1994},
isbn = {0-89791-643-3},
location = {Orlando, Florida, USA},
pages = {162--172},
numpages = {11},
url = {http://doi.acm.org/10.1145/182409.156781},
doi = {10.1145/182409.156781},
acmid = {156781},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=156781&ftid=25751&dwn=1&CFID=744742186&CFTOKEN=77967707},
review = {fbie: accepted <2016-01-13 13:45:12>},
}
@article{Hall:1994:UHT:182590.156781,
author = {Hall, Cordelia V.},
title = {Using {Hindley-Milner} Type Inference to Optimise List
Representation},
journal = {SIGPLAN Lisp Pointers},
issue_date = {July-Sept. 1994},
volume = {VII},
number = {3},
month = jul,
year = {1994},
issn = {1045-3563},
pages = {162--172},
numpages = {11},
url = {http://doi.acm.org/10.1145/182590.156781},
doi = {10.1145/182590.156781},
acmid = {156781},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=156781&ftid=25751&dwn=1&CFID=744742186&CFTOKEN=77967707},
fullTextFile = {.slirm_cache/Hall_1994_Using-Hindley.pdf},
review = {fbie: accepted <2016-01-13 13:45:08>},
}
@inproceedings{Lowney:1981:CAI:567532.567533,
author = {Lowney, P. Geoffrey},
title = {Carrier Arrays: An Idiom-preserving Extension to
{APL}},
booktitle = {Proceedings of the 8th ACM SIGPLAN-SIGACT Symposium
on Principles of Programming Languages},
series = {POPL '81},
year = {1981},
isbn = {0-89791-029-X},
location = {Williamsburg, Virginia},
pages = {1--13},
numpages = {13},
url = {http://doi.acm.org/10.1145/567532.567533},
doi = {10.1145/567532.567533},
acmid = {567533},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=567533&ftid=84193&dwn=1&CFID=744742186&CFTOKEN=77967707},
fullTextFile = {.slirm_cache/Lowney_1981_Carrier-Arrays.pdf},
notes = {"A carrier array is a ragged array with an associated partition which allows functions to be applied to subarrays in parallel."},
review = {fbie: accepted <2016-01-13 13:41:28>},
}
@article{Perrott:1979:LAV:357073.357075,
author = {Perrott, R. H.},
title = {A Language for Array and Vector Processors},
journal = {ACM Trans. Program. Lang. Syst.},
issue_date = {Oct. 1979},
volume = {1},
number = {2},
month = oct,
year = {1979},
issn = {0164-0925},
pages = {177--195},
numpages = {19},
url = {http://doi.acm.org/10.1145/357073.357075},
doi = {10.1145/357073.357075},
acmid = {357075},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=357075&ftid=51974&dwn=1&CFID=744742186&CFTOKEN=77967707},
fullTextFile = {.slirm_cache/Perrott_1979_A-Language.pdf},
notes = {High-level languages for parallel programming.},
review = {fbie: accepted <2016-01-13 13:37:16>},
}
@inproceedings{Kadayif:2002:ILP:513918.514096,
author = {Kadayif, I. and Kandemir, M. and Sezer, U.},
title = {An Integer Linear Programming Based Approach for
Parallelizing Applications in On-chip
Multiprocessors},
booktitle = {Proceedings of the 39th Annual Design Automation
Conference},
series = {DAC '02},
year = {2002},
isbn = {1-58113-461-4},
location = {New Orleans, Louisiana, USA},
pages = {703--706},
numpages = {4},
url = {http://doi.acm.org/10.1145/513918.514096},
doi = {10.1145/513918.514096},
acmid = {514096},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {constraint-based compilation, embedded systems,
loop-level parallelism},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=514096&ftid=72110&dwn=1&CFID=744742186&CFTOKEN=77967707},
fullTextFile = {.slirm_cache/Kadayif_2002_An-Integer.pdf},
notes = {Automatic parallelization of array-intensive languages with multiple constraints (here performance and energy consumption).},
review = {fbie: accepted <2016-01-13 13:22:47>},
}
@inproceedings{Sastry:1994:PDU:182409.182486,
  author      = {Sastry, A. V. S. and Clinger, William},
  title       = {Parallel Destructive Updating in Strict Functional Languages},
  booktitle   = {Proceedings of the 1994 ACM Conference on LISP and Functional Programming},
  series      = {LFP '94},
  location    = {Orlando, Florida, USA},
  year        = {1994},
  isbn        = {0-89791-643-3},
  pages       = {263--272},
  numpages    = {10},
  doi         = {10.1145/182409.182486},
  url         = {http://doi.acm.org/10.1145/182409.182486},
  acmid       = {182486},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=182486&ftid=27882&dwn=1&CFID=744742186&CFTOKEN=77967707},
  review      = {fbie: accepted <2016-01-13 13:17:29>},
}
@article{Sastry:1994:PDU:182590.182486,
author = {Sastry, A. V. S. and Clinger, William},
title = {Parallel Destructive Updating in Strict Functional
Languages},
journal = {SIGPLAN Lisp Pointers},
issue_date = {July-Sept. 1994},
volume = {VII},
number = {3},
month = jul,
year = {1994},
issn = {1045-3563},
pages = {263--272},
numpages = {10},
url = {http://doi.acm.org/10.1145/182590.182486},
doi = {10.1145/182590.182486},
acmid = {182486},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=182486&ftid=27882&dwn=1&CFID=574981389&CFTOKEN=13307282},
fullTextFile = {.slirm_cache/Sastry_1994_Parallel-Destructive.pdf},
notes = {Concerns array updates in functional languages. Author acknowledges that there has not been much research in this area and mentions amongst others SISAL as an exception.},
review = {fbie: accepted <2016-01-13 13:17:24>},
}
@inproceedings{Sinkarovs:2013:SDL:2502323.2502332,
author = {Sinkarovs, Artjoms and Scholz, Sven-Bodo},
title = {Semantics-preserving Data Layout Transformations for
Improved Vectorisation},
booktitle = {Proceedings of the 2nd ACM SIGPLAN Workshop on
Functional High-performance Computing},
series = {FHPC '13},
year = {2013},
isbn = {978-1-4503-2381-9},
location = {Boston, Massachusetts, USA},
pages = {59--70},
numpages = {12},
url = {http://doi.acm.org/10.1145/2502323.2502332},
doi = {10.1145/2502323.2502332},
acmid = {2502332},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {correctness, program transformation, type systems,
vectorisation},
abstract = {Data-Layouts that are favourable from an algorithmic perspective often are less suitable for vectorisation, i.e., for an effective use of modern processor's vector instructions. This paper presents work on a compiler driven approach towards automatically transforming data layouts into a form that is suitable for vectorisation. In particular, we present a program transformation for a first-order functional array programming language that systematically modifies the layouts of all data structures. At the same time, the transformation also adjusts the code that operates on these structures so that the overall computation remains unchanged. We define a correctness criterion for layout modifying program transformations and we show that our transformation abides to this criterion.},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2502332&ftid=1397477&dwn=1&CFID=574981389&CFTOKEN=13307282},
review = {fbie: accepted <2016-01-13 13:09:45>},
}
@inproceedings{Knobe:1998:ASF:268946.268956,
author = {Knobe, Kathleen and Sarkar, Vivek},
title = {Array {SSA} Form and Its Use in Parallelization},
booktitle = {Proceedings of the 25th ACM SIGPLAN-SIGACT Symposium
on Principles of Programming Languages},
series = {POPL '98},
year = {1998},
isbn = {0-89791-979-3},
location = {San Diego, California, USA},
pages = {107--120},
numpages = {14},
url = {http://doi.acm.org/10.1145/268946.268956},
doi = {10.1145/268946.268956},
acmid = {268956},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=268956&ftid=33536&dwn=1&CFID=574981389&CFTOKEN=13307282},
fullTextFile = {.slirm_cache/Knobe_1998_Array-SSA.pdf},
notes = {SSA stands for "static single assignment".},
review = {fbie: accepted <2016-01-13 13:04:58>},
}
@article{Arvind:1989:IDS:69558.69562,
author = {Arvind and Nikhil, Rishiyur S. and Pingali, Keshav
K.},
title = {I-structures: Data Structures for Parallel
Computing},
journal = {ACM Trans. Program. Lang. Syst.},
issue_date = {Oct. 1989},
volume = {11},
number = {4},
month = oct,
year = {1989},
issn = {0164-0925},
pages = {598--632},
numpages = {35},
url = {http://doi.acm.org/10.1145/69558.69562},
doi = {10.1145/69558.69562},
acmid = {69562},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=69562&ftid=19522&dwn=1&CFID=574981389&CFTOKEN=13307282},
fullTextFile = {.slirm_cache/Arvind and Nikhil_1989_I-structures.pdf},
notes = {I am not quite sure about this one, but it seems an interesting paper.},
review = {fbie: accepted <2016-01-13 13:02:55>},
}
@article{Ching:1990:APA:97811.97826,
author = {Ching, Wai-Mee},
title = {Automatic Parallelization of {APL}-style Programs},
journal = {SIGAPL APL Quote Quad},
issue_date = {July 1990},
volume = {20},
number = {4},
month = may,
year = {1990},
issn = {0163-6006},
pages = {76--80},
numpages = {5},
url = {http://doi.acm.org/10.1145/97811.97826},
doi = {10.1145/97811.97826},
acmid = {97826},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=97826&ftid=14719&dwn=1&CFID=574974113&CFTOKEN=15072837},
review = {fbie: accepted <2016-01-13 11:30:39>},
}
@inproceedings{Ching:1990:APA:97808.97826,
author = {Ching, Wai-Mee},
title = {Automatic Parallelization of {APL}-style Programs},
booktitle = {Conference Proceedings on {APL} 90: For the Future},
series = {APL '90},
year = {1990},
isbn = {0-89791-371-X},
location = {Copenhagen, Denmark},
pages = {76--80},
numpages = {5},
url = {http://doi.acm.org/10.1145/97808.97826},
doi = {10.1145/97808.97826},
acmid = {97826},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=97826&ftid=14719&dwn=1&CFID=574974113&CFTOKEN=15072837},
fullTextFile = {.slirm_cache/Ching_1990_Automatic.pdf},
review = {fbie: accepted <2016-01-13 11:30:32>},
}
@inproceedings{Maydan:1993:AFA:158511.158515,
author = {Maydan, Dror E. and Amarasinghe, Saman P. and Lam,
Monica S.},
title = {Array-data Flow Analysis and Its Use in Array
Privatization},
booktitle = {Proceedings of the 20th ACM SIGPLAN-SIGACT Symposium
on Principles of Programming Languages},
series = {POPL '93},
year = {1993},
isbn = {0-89791-560-7},
location = {Charleston, South Carolina, USA},
pages = {2--15},
numpages = {14},
url = {http://doi.acm.org/10.1145/158511.158515},
doi = {10.1145/158511.158515},
acmid = {158515},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=158515&ftid=33071&dwn=1&CFID=574974113&CFTOKEN=15072837},
fullTextFile = {.slirm_cache/Maydan_1993_Array.pdf},
notes = {An efficient algorithm for analyzing array accesses in nested loops via data-flow techniques.},
review = {fbie: accepted <2016-01-13 11:28:39>},
}
@article{Chakravarty:2001:FAF:507669.507661,
author = {Chakravarty, Manuel M. T. and Keller, Gabriele},
title = {Functional Array Fusion},
journal = {SIGPLAN Not.},
issue_date = {October 2001},
volume = {36},
number = {10},
month = oct,
year = {2001},
issn = {0362-1340},
pages = {205--216},
numpages = {12},
url = {http://doi.acm.org/10.1145/507669.507661},
doi = {10.1145/507669.507661},
acmid = {507661},
publisher = {ACM},
address = {New York, NY, USA},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=507661&ftid=69665&dwn=1&CFID=574974113&CFTOKEN=15072837},
fullTextFile = {.slirm_cache/Chakravarty_2001_Functional.pdf},
review = {fbie: accepted <2016-01-13 11:25:44>},
}
@inproceedings{Chakravarty:2001:FAF:507635.507661,
  author      = {Chakravarty, Manuel M. T. and Keller, Gabriele},
  title       = {Functional Array Fusion},
  booktitle   = {Proceedings of the Sixth ACM SIGPLAN International Conference on Functional Programming},
  series      = {ICFP '01},
  location    = {Florence, Italy},
  year        = {2001},
  isbn        = {1-58113-415-0},
  pages       = {205--216},
  numpages    = {12},
  doi         = {10.1145/507635.507661},
  url         = {http://doi.acm.org/10.1145/507635.507661},
  acmid       = {507661},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=507661&ftid=69665&dwn=1&CFID=574965416&CFTOKEN=24274792},
  fullTextFile = {.slirm_cache/Chakravarty_2001_Functional.pdf},
  notes       = {Describes loop fusion in the Haskell compiler, the abstract claims that it is geared towards parallelism.},
  review      = {fbie: accepted <2016-01-13 11:25:25>},
}
@article{Pugh:1998:CAD:291889.291900,
author = {Pugh, William and Wonnacott, David},
title = {Constraint-based Array Dependence Analysis},
journal = {ACM Trans. Program. Lang. Syst.},
issue_date = {May 1998},
volume = {20},
number = {3},
month = may,
year = {1998},
issn = {0164-0925},
pages = {635--678},
numpages = {44},
url = {http://doi.acm.org/10.1145/291889.291900},
doi = {10.1145/291889.291900},
acmid = {291900},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Presburger Arithmetic, array dataflow analysis,
dependence abstraction, dependence analysis,
parallelization, static analysis},
fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=291900&ftid=32245&dwn=1&CFID=574965416&CFTOKEN=24274792},
fullTextFile = {.slirm_cache/Pugh_1998_Constraint.pdf},
review = {fbie: accepted <2016-01-13 11:08:21>},
abstract = {Traditional array dependence analysis, which detects potential memory aliasing of array references is a key analysis technique for automatic parallelization. Recent studies of benchmark codes indicate that limitations of analysis cause many compilers to overlook large amounts of potential parallelism, and that exploiting this parallelism requires algorithms to answer new question about array references, not just get better answers to the old questions of aliasing. We need to ask about the flow of values in arrays, to check the legality of array privatization, and about the conditions under which a dependence exists, to obtain information about conditional parallelism. In some cases, we must answer these questions about code containing nonlinear terms in loop bounds or subscripts. This article describes techniques for phrasing these questions in terms of systems of constraints. Conditional dependence analysis can be performed with a constraint operation we call the "gist" operation. When subscripts and loop bounds are affine, questions about the flow of values in array variables can be phrased in terms of Presburger Arithmetic. When the constraints describing a dependence are not affine, we introduce uninterpreted function symbols to represent the nonaffine terms. Our constraint language also provides a rich language for communication with the dependence analyzer, by either the programmer or other phases of the compiler. This article also documents our investigations of the practicality of our approach. The worst-case complexity of Presburger Arithmetic indicates that it might be unsuitable for any practical application. However, we have found that analysis of benchmark programs does not cause the exponential growth in the number of constraints that could occur in the worst case. We have studied the constraints produced during our analysis, and identified characteristics that keep our algorithms free of exponential behavior in practice.},
}
@inproceedings{Henriksen:2013:TGA:2502323.2502328,
  author      = {Henriksen, Troels and Oancea, Cosmin Eugen},
  title       = {A {T2} Graph-reduction Approach to Fusion},
  booktitle   = {Proceedings of the 2nd ACM SIGPLAN Workshop on Functional High-performance Computing},
  series      = {FHPC '13},
  year        = {2013},
  isbn        = {978-1-4503-2381-9},
  location    = {Boston, Massachusetts, USA},
  pages       = {47--58},
  numpages    = {12},
  url         = {http://doi.acm.org/10.1145/2502323.2502328},
  doi         = {10.1145/2502323.2502328},
  acmid       = {2502328},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {autoparallelization, functional language, fusion},
  abstract    = {Fusion is one of the most important code transformations as it has the potential to substantially optimize both the memory hierarchy time overhead and, sometimes asymptotically, the space requirement. In functional languages, fusion is naturally and relatively easily derived as a producer-consumer relation between program constructs that expose a richer, higher-order algebra of program invariants, such as the map-reduce list homomorphisms. In imperative languages, fusing producer-consumer loops requires dependency analysis on arrays applied at loop-nest level. Such analysis, however, has often been labeled as "heroic effort" and, if at all, is supported only in its simplest and most conservative form in industrial compilers. Related implementations in the functional context typically apply fusion only when the to-be-fused producer is used exactly once, i.e., in the consumer. This guarantees that the transformation is conservative: the resulting program does not duplicate computation. We show that the above restriction is more conservative than needed, and present a structural-analysis technique, inspired from the T1--T2 transformation for reducible data flow, that enables fusion even in some cases when the producer is used in different consumers and without duplicating computation. We report an implementation of the fusion algorithm for a functional-core language, named L0, which is intended to support nested parallelism across regular multi-dimensional arrays. We succinctly describe L0's semantics and the compiler infrastructure on which the fusion transformation relies, and present compiler-generated statistics related to fusion on a set of six benchmarks.},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2502328&ftid=1397476&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:55:42>},
}
@inproceedings{Stucki:2015:RVP:2784731.2784739,
  author      = {Stucki, Nicolas and Rompf, Tiark and Ureche, Vlad and Bagwell, Phil},
  title       = {{RRB} Vector: A Practical General Purpose Immutable Sequence},
  booktitle   = {Proceedings of the 20th ACM SIGPLAN International Conference on Functional Programming},
  series      = {ICFP 2015},
  year        = {2015},
  isbn        = {978-1-4503-3669-7},
  location    = {Vancouver, BC, Canada},
  pages       = {342--354},
  numpages    = {13},
  url         = {http://doi.acm.org/10.1145/2784731.2784739},
  doi         = {10.1145/2784731.2784739},
  acmid       = {2784739},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {Arrays, Data Structures, Immutable, Radix-Balanced, Relaxed-Radix-Balanced, Sequences, Trees, Vectors},
  abstract    = {State-of-the-art immutable collections have wildly differing performance characteristics across their operations, often forcing programmers to choose different collection implementations for each task. Thus, changes to the program can invalidate the choice of collections, making code evolution costly. It would be desirable to have a collection that performs well for a broad range of operations. To this end, we present the RRB-Vector, an immutable sequence collection that offers good performance across a large number of sequential and parallel operations. The underlying innovations are: (1) the Relaxed-Radix-Balanced (RRB) tree structure, which allows efficient structural reorganization, and (2) an optimization that exploits spatio-temporal locality on the RRB data structure in order to offset the cost of traversing the tree. In our benchmarks, the RRB-Vector speedup for parallel operations is lower bounded by 7x when executing on 4 CPUs of 8 cores each. The performance for discrete operations, such as appending on either end, or updating and removing elements, is consistently good and compares favorably to the most important immutable sequence collections in the literature and in use today. The memory footprint of RRB-Vector is on par with arrays and an order of magnitude less than competing collections.},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2784739&ftid=1616034&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:53:43>},
}
@article{Stucki:2015:RVP:2858949.2784739,
  author      = {Stucki, Nicolas and Rompf, Tiark and Ureche, Vlad and Bagwell, Phil},
  title       = {{RRB} Vector: A Practical General Purpose Immutable Sequence},
  journal     = {SIGPLAN Not.},
  issue_date  = {September 2015},
  volume      = {50},
  number      = {9},
  month       = aug,
  year        = {2015},
  issn        = {0362-1340},
  pages       = {342--354},
  numpages    = {13},
  url         = {http://doi.acm.org/10.1145/2858949.2784739},
  doi         = {10.1145/2858949.2784739},
  acmid       = {2784739},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {Arrays, Data Structures, Immutable, Radix-Balanced, Relaxed-Radix-Balanced, Sequences, Trees, Vectors},
  abstract    = {State-of-the-art immutable collections have wildly differing performance characteristics across their operations, often forcing programmers to choose different collection implementations for each task. Thus, changes to the program can invalidate the choice of collections, making code evolution costly. It would be desirable to have a collection that performs well for a broad range of operations. To this end, we present the RRB-Vector, an immutable sequence collection that offers good performance across a large number of sequential and parallel operations. The underlying innovations are: (1) the Relaxed-Radix-Balanced (RRB) tree structure, which allows efficient structural reorganization, and (2) an optimization that exploits spatio-temporal locality on the RRB data structure in order to offset the cost of traversing the tree. In our benchmarks, the RRB-Vector speedup for parallel operations is lower bounded by 7x when executing on 4 CPUs of 8 cores each. The performance for discrete operations, such as appending on either end, or updating and removing elements, is consistently good and compares favorably to the most important immutable sequence collections in the literature and in use today. The memory footprint of RRB-Vector is on par with arrays and an order of magnitude less than competing collections.},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2784739&ftid=1616034&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:53:40>},
}
@inproceedings{Hwang:1995:AOS:209936.209949,
  author      = {Hwang, Gwan-Hwan and Lee, Jenq Kuen and Ju, Dz-Ching},
  title       = {An Array Operation Synthesis Scheme to Optimize {Fortran} 90 Programs},
  booktitle   = {Proceedings of the Fifth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  series      = {PPOPP '95},
  year        = {1995},
  isbn        = {0-89791-700-6},
  location    = {Santa Barbara, California, USA},
  pages       = {112--122},
  numpages    = {11},
  url         = {http://doi.acm.org/10.1145/209936.209949},
  doi         = {10.1145/209936.209949},
  acmid       = {209949},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=209949&ftid=47044&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:52:24>},
  abstract    = {An increasing number of programming languages, such as Fortran 90 and APL, are providing a rich set of intrinsic array functions and array expressions. These constructs which constitute an important part of data parallel languages provide excellent opportunities for compiler optimizations. In this paper, we present a new approach to combine consecutive data access patterns of array constructs into a composite access function to the source arrays. Our scheme is based on the composition of access functions, which is similar to a composition of mathematic functions. Our new scheme can handle not only data movements of arrays of different numbers of dimensions and segmented array operations but also masked array expressions and multiple sources array operations. As a result, our proposed scheme is the first synthesis scheme which can synthesize Fortran 90 RESHAPE, EOSHIFT, MERGE, and WHERE constructs together. Experimental results show speedups from 1.21 to 2.95 for code fragments from real applications on a Sequent multiprocessor machine by incorporating the proposed optimizations.},
}
@article{Hwang:1995:AOS:209937.209949,
  author       = {Hwang, Gwan-Hwan and Lee, Jenq Kuen and Ju, Dz-Ching},
  title        = {An Array Operation Synthesis Scheme to Optimize {Fortran} 90 Programs},
  journal      = {SIGPLAN Not.},
  issue_date   = {Aug. 1995},
  volume       = {30},
  number       = {8},
  month        = aug,
  year         = {1995},
  issn         = {0362-1340},
  pages        = {112--122},
  numpages     = {11},
  url          = {http://doi.acm.org/10.1145/209937.209949},
  doi          = {10.1145/209937.209949},
  acmid        = {209949},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  fullTextUrl  = {http://dl.acm.org/ft_gateway.cfm?id=209949&ftid=47044&dwn=1&CFID=574773947&CFTOKEN=83431304},
  fullTextFile = {.slirm_cache/Hwang_1995_An.pdf},
  notes        = {Optimizing array accesses in Fortran.},
  review       = {fbie: accepted <2016-01-12 16:52:18>},
  abstract     = {An increasing number of programming languages, such as Fortran 90 and APL, are providing a rich set of intrinsic array functions and array expressions. These constructs which constitute an important part of data parallel languages provide excellent opportunities for compiler optimizations. In this paper, we present a new approach to combine consecutive data access patterns of array constructs into a composite access function to the source arrays. Our scheme is based on the composition of access functions, which is similar to a composition of mathematic functions. Our new scheme can handle not only data movements of arrays of different numbers of dimensions and segmented array operations but also masked array expressions and multiple sources array operations. As a result, our proposed scheme is the first synthesis scheme which can synthesize Fortran 90 RESHAPE, EOSHIFT, MERGE, and WHERE constructs together. Experimental results show speedups from 1.21 to 2.95 for code fragments from real applications on a Sequent multiprocessor machine by incorporating the proposed optimizations.},
}
@inproceedings{Henriksen:2014:BCI:2627373.2627388,
  title       = {Bounds Checking: An Instance of Hybrid Analysis},
  author      = {Henriksen, Troels and Oancea, Cosmin E.},
  booktitle   = {Proceedings of ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming},
  series      = {ARRAY'14},
  location    = {Edinburgh, United Kingdom},
  year        = {2014},
  isbn        = {978-1-4503-2937-8},
  pages       = {88:88--88:94},
  articleno   = {88},
  numpages    = {7},
  doi         = {10.1145/2627373.2627388},
  url         = {http://doi.acm.org/10.1145/2627373.2627388},
  acmid       = {2627388},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {autoparallelization, functional language, subscripts bounds checking},
  abstract    = {This paper presents an analysis for bounds checking of array subscripts that lifts checking assertions to program level under the form of an arbitrarily-complex predicate (inspector), whose runtime evaluation guards the execution of the code of interest. Separating the predicate from the computation makes it more amenable to optimization, and allows it to be split into a cascade of sufficient conditions of increasing complexity that optimizes the common-inspection path. While synthesizing the bounds checking invariant resembles type checking techniques, we rely on compiler simplification and runtime evaluation rather than employing complex inference and annotation systems that might discourage the non-specialist user. We integrate the analysis in the compiler's repertoire of Futhark: a purely-functional core language supporting map-reduce nested parallelism on regular arrays, and show how the high-level language invariants enable a relatively straightforward analysis. Finally, we report a qualitative evaluation of our technique on three real-world applications from the financial domain that indicates that the runtime overhead of predicates is negligible.},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2627388&ftid=1503139&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:42:34>},
}
@inproceedings{Walinsky:1990:FPL:91556.91610,
  title        = {A Functional Programming Language Compiler for Massively Parallel Computers},
  author       = {Walinsky, Clifford and Banerjee, Deb},
  booktitle    = {Proceedings of the 1990 ACM Conference on LISP and Functional Programming},
  series       = {LFP '90},
  location     = {Nice, France},
  year         = {1990},
  isbn         = {0-89791-368-X},
  pages        = {131--138},
  numpages     = {8},
  doi          = {10.1145/91556.91610},
  url          = {http://doi.acm.org/10.1145/91556.91610},
  acmid        = {91610},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  fullTextUrl  = {http://dl.acm.org/ft_gateway.cfm?id=91610&ftid=34538&dwn=1&CFID=574773947&CFTOKEN=83431304},
  fullTextFile = {.slirm_cache/Walinsky_1990_A.pdf},
  notes        = {High-level parallelism in the FP language.},
  review       = {fbie: accepted <2016-01-12 16:41:13>},
  abstract     = {Functional programming languages remove programmers from low-level machine details, an important achievement when programming massively parallel systems. We present an overview of an FP compiler that generates programs capable of exploiting data-parallelism, a view of parallelism where distinct data elements reside on distinct processors and all processors execute a single instruction stream. To achieve this form of parallelism, FP's sequences are represented as arrays. This representation makes possible optimization techniques developed for APL compilers that compose routing functions at compile-time. These techniques are described succinctly by a set of axioms and inference rules. We demonstrate the optimizations by compiling several FP functions, obtaining optimal performance.},
}
@inproceedings{Bernecky:2015:AEP:2774959.2774962,
  author      = {Bernecky, Robert and Scholz, Sven-Bodo},
  title       = {Abstract Expressionism for Parallel Performance},
  booktitle   = {Proceedings of the 2nd ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming},
  series      = {ARRAY 2015},
  year        = {2015},
  isbn        = {978-1-4503-3584-3},
  location    = {Portland, OR, USA},
  pages       = {54--59},
  numpages    = {6},
  url         = {http://doi.acm.org/10.1145/2774959.2774962},
  doi         = {10.1145/2774959.2774962},
  acmid       = {2774962},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {APL, HPC, SAC, algorithms, expressiveness, functional array languages, parallelism, readability},
  abstract    = {Programming with abstract, mathematical expressions offers benefits including terser programs, easier communication of algorithms, ability to prove theorems about algorithms, increased parallelism, and improved programming productivity. Common belief is that higher levels of abstraction imply a larger semantic gap between the user and computer and, therefore, typically slower execution, whether sequential or parallel. In recent years, domain-specific languages have been shown to close this gap through sophisticated optimizations benefitting from domain-specific knowledge. In this paper, we demonstrate that the semantic gap can also be closed for non-domain-specific functional array languages, without requiring embedding of language-specific semantic knowledge into the compiler tool chain. We present a simple example of APL-style programs, compiled into C-code that outperform equivalent C programs in both sequential and parallel (OpenMP) environments. We offer insights into abstract expressionist programming, by comparing the characteristics and performance of a numerical relaxation benchmark written in C99, C99 with OpenMP directives, scheduling code, and pragmas, and in SaC, a functional array language. We compare three algorithmic styles: if/then/else, hand-optimized loop splitting, and an abstract, functional style whose roots lie in APL. We show that the algorithms match or outperform serial C, and that the hand-optimized and abstract styles generate identical code, and so have identical performance. Furthermore, parallel variants also outperform the best OpenMP C variant by up to a third, with no source code modifications. Preserving an algorithm's abstract expression during optimization opens the door to generation of radically different code for different architectures. [The author list is wrong, but I see no way to correct, despite the fact that EasyChair has the correct author list.]},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2774962&ftid=1589049&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:38:31>},
}
@inproceedings{Fumero:2014:CAF:2627373.2627381,
  author      = {Fumero, Juan Jos{\'e} and Steuwer, Michel and Dubach, Christophe},
  title       = {A Composable Array Function Interface for Heterogeneous Computing in {Java}},
  booktitle   = {Proceedings of ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming},
  series      = {ARRAY'14},
  year        = {2014},
  isbn        = {978-1-4503-2937-8},
  location    = {Edinburgh, United Kingdom},
  pages       = {44:44--44:49},
  articleno   = {44},
  numpages    = {6},
  url         = {http://doi.acm.org/10.1145/2627373.2627381},
  doi         = {10.1145/2627373.2627381},
  acmid       = {2627381},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {Array programming, GPGPU, Patterns},
  abstract    = {Heterogeneous computing has now become mainstream with virtually every desktop machines featuring accelerators such as Graphics Processing Units (GPUs). While heterogeneity offers the promise of high-performance and high-efficiency, it comes at the cost of huge programming difficulties. Languages and interfaces for programming such system tend to be low-level and require expert knowledge of the hardware in order to achieve its potential. A promising approach for programming such heterogeneous systems is the use of array programming. This style of programming relies on well known parallel patterns that can be easily translated into GPU or other accelerator code. However, only little work has been done on integrating such concepts in mainstream languages such as Java. In this work, we propose a new Array Function interface implemented with the new features from Java 8. While similar in spirit to the new Stream API of Java, our API follows a different design based on reusability and composability. We demonstrate that this API can be used to generate OpenCL code for a simple application. We present encouraging preliminary performance results showing the potential of our approach.},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=2627381&ftid=1503132&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:34:03>},
}
@inproceedings{Fluet:2008:SFG:1411204.1411239,
  title       = {A Scheduling Framework for General-purpose Parallel Languages},
  author      = {Fluet, Matthew and Rainey, Mike and Reppy, John},
  booktitle   = {Proceedings of the 13th ACM SIGPLAN International Conference on Functional Programming},
  series      = {ICFP '08},
  location    = {Victoria, BC, Canada},
  year        = {2008},
  isbn        = {978-1-59593-919-7},
  pages       = {241--252},
  numpages    = {12},
  doi         = {10.1145/1411204.1411239},
  url         = {http://doi.acm.org/10.1145/1411204.1411239},
  acmid       = {1411239},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  keywords    = {compilers, heterogeneous parallel languages, run-time systems, scheduling},
  abstract    = {The trend in microprocessor design toward multicore and manycore processors means that future performance gains in software will largely come from harnessing parallelism. To realize such gains, we need languages and implementations that can enable parallelism at many different levels. For example, an application might use both explicit threads to implement course-grain parallelism for independent tasks and implicit threads for fine-grain data-parallel computation over a large array. An important aspect of this requirement is supporting a wide range of different scheduling mechanisms for parallel computation. In this paper, we describe the scheduling framework that we have designed and implemented for Manticore, a strict parallel functional language. We take a micro-kernel approach in our design: the compiler and runtime support a small collection of scheduling primitives upon which complex scheduling policies can be implemented. This framework is extremely flexible and can support a wide range of different scheduling policies. It also supports the nesting of schedulers, which is key to both supporting multiple scheduling policies in the same application and to hierarchies of speculative parallel computations. In addition to describing our framework, we also illustrate its expressiveness with several popular scheduling techniques. We present a (mostly) modular approach to extending our schedulers to support cancellation. This mechanism is essential for implementing eager and speculative parallelism. We finally evaluate our framework with a series of benchmarks and an analysis.},
  fullTextUrl = {http://dl.acm.org/ft_gateway.cfm?id=1411239&ftid=551291&dwn=1&CFID=574773947&CFTOKEN=83431304},
  review      = {fbie: accepted <2016-01-12 16:33:06>},
}
@article{Fluet:2008:SFG:1411203.1411239,
author = {Fluet, Matthew and Rainey, Mike and Reppy, John},
title = {A Scheduling Framework for General-purpose Parallel
Languages},
journal = {SIGPLAN Not.},
issue_date = {September 2008},
volume = {43},
number = {9},
month = {sep},
year = {2008},
issn = {0362-1340},
pages = {241--252},
numpages = {12},
url = {http://doi.acm.org/10.1145/1411203.1411239},
doi = {10.1145/1411203.1411239},
acmid = {1411239},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {compilers, heterogeneous parallel languages,
run-time systems, scheduling},
abstract = {The trend in microprocessor design toward multicore
and manycore processors means that future
performance gains in software will largely come from
harnessing parallelism. To realize such gains, we
need languages and implementations that can enable
parallelism at many different levels. For example,
an application might use both explicit threads to