% array-programming-acm.bib
@inproceedings{Collins:2014:NFL:2627373.2627375,
author = {Collins, Alexander and Grewe, Dominik and Grover,
Vinod and Lee, Sean and Susnea, Adriana},
title = {NOVA: A Functional Language for Data Parallelism},
booktitle = {Proceedings of ACM SIGPLAN International Workshop on
Libraries, Languages, and Compilers for Array
Programming},
series = {ARRAY'14},
year = 2014,
isbn = {978-1-4503-2937-8},
location = {Edinburgh, United Kingdom},
pages = {8:8--8:13},
articleno = 8,
numpages = 6,
url = {http://doi.acm.org/10.1145/2627373.2627375},
doi = {10.1145/2627373.2627375},
acmid = 2627375,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Array-oriented programming, CUDA, Code generation,
Compilation, Functional programming, Multi-core CPU},
abstract = {Functional languages provide a solid foundation on
which complex optimization passes can be designed to
exploit parallelism available in the underlying
system. Their mathematical foundations enable
high-level optimizations that would be impossible in
traditional imperative languages. This makes them
uniquely suited for generation of efficient target
code for parallel systems, such as multiple Central
Processing Units (CPUs) or highly data-parallel
Graphics Processing Units (GPUs). Such systems are
becoming the mainstream for scientific and commodity
desktop computing. Writing performance portable code
for such systems using low-level languages requires
significant effort from a human expert. This paper
presents NOVA, a functional language and compiler
for multi-core CPUs and GPUs. The NOVA language is a
polymorphic, statically-typed functional language
with a suite of higher-order functions which are
used to express parallelism. These include map,
reduce and scan. The NOVA compiler is a
light-weight, yet powerful, optimizing compiler. It
generates code for a variety of target platforms
that achieve performance comparable to competing
languages and tools, including hand-optimized
code. The NOVA compiler is stand-alone and can be
easily used as a target for higher-level or domain
specific languages or embedded in other
applications. We evaluate NOVA against two competing
approaches: the Thrust library and hand-written CUDA
C. NOVA achieves comparable performance to these
approaches across a range of
benchmarks. NOVA-generated code also scales linearly
with the number of processor cores across all
compute-bound benchmarks.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2627375&ftid=1503126&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:08:12>},
}
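% NOVA's concrete syntax is not shown in the abstract. As a rough,
% hypothetical sketch of the programming model it describes, namely
% parallelism expressed through the higher-order functions map,
% reduce and scan, here is the same style in Haskell, with plain
% lists standing in for NOVA's parallel arrays:
%
%   -- dot product as zipWith plus a reduction
%   dotProduct :: [Double] -> [Double] -> Double
%   dotProduct xs ys = foldr (+) 0 (zipWith (*) xs ys)
%
%   -- running totals via a scan
%   prefixSums :: [Double] -> [Double]
%   prefixSums = scanl (+) 0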
@article{Keller:2010:RSP:1932681.1863582,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Peyton Jones, Simon and
Lippmeier, Ben},
title = {Regular, Shape-polymorphic, Parallel Arrays in
Haskell},
journal = {SIGPLAN Not.},
issue_date = {September 2010},
volume = 45,
number = 9,
month = sep,
year = 2010,
issn = {0362-1340},
pages = {261--272},
numpages = 12,
url = {http://doi.acm.org/10.1145/1932681.1863582},
doi = {10.1145/1932681.1863582},
acmid = 1863582,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a novel approach to regular,
multi-dimensional arrays in Haskell. The main
highlights of our approach are that it (1) is purely
functional, (2) supports reuse through shape
polymorphism, (3) avoids unnecessary intermediate
structures rather than relying on subsequent loop
fusion, and (4) supports transparent
parallelisation. We show how to embed two forms of
shape polymorphism into Haskell's type system using
type classes and type families. In particular, we
discuss the generalisation of regular array
transformations to arrays of higher rank, and
introduce a type-safe specification of array
slices. We discuss the runtime performance of our
approach for three standard array algorithms. We
achieve absolute performance comparable to
handwritten C code. At the same time, our
implementation scales well up to 8 processor cores.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1863582&ftid=845214&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:08:58>},
}
@inproceedings{Keller:2010:RSP:1863543.1863582,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Peyton Jones, Simon and
Lippmeier, Ben},
title = {Regular, Shape-polymorphic, Parallel Arrays in
Haskell},
booktitle = {Proceedings of the 15th ACM SIGPLAN International
Conference on Functional Programming},
series = {ICFP '10},
year = 2010,
isbn = {978-1-60558-794-3},
location = {Baltimore, Maryland, USA},
pages = {261--272},
numpages = 12,
url = {http://doi.acm.org/10.1145/1863543.1863582},
doi = {10.1145/1863543.1863582},
acmid = 1863582,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a novel approach to regular,
multi-dimensional arrays in Haskell. The main
highlights of our approach are that it (1) is purely
functional, (2) supports reuse through shape
polymorphism, (3) avoids unnecessary intermediate
structures rather than relying on subsequent loop
fusion, and (4) supports transparent
parallelisation. We show how to embed two forms of
shape polymorphism into Haskell's type system using
type classes and type families. In particular, we
discuss the generalisation of regular array
transformations to arrays of higher rank, and
introduce a type-safe specification of array
slices. We discuss the runtime performance of our
approach for three standard array algorithms. We
achieve absolute performance comparable to
handwritten C code. At the same time, our
implementation scales well up to 8 processor cores.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1863582&ftid=845214&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:09:02>},
}
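% A minimal sketch of the shape polymorphism the paper describes,
% written against the repa package API that grew out of this work;
% the function and value names here are illustrative:
%
%   import Data.Array.Repa as R
%
%   -- shape-polymorphic: scale works for arrays of any rank sh
%   scale :: (Shape sh, Source r Double)
%         => Double -> Array r sh Double -> Array D sh Double
%   scale k = R.map (* k)
%
%   -- a rank-2 instance; DIM2 abbreviates Z :. Int :. Int
%   grid :: Array U DIM2 Double
%   grid = fromListUnboxed (Z :. 2 :. 3) [1 .. 6]
%
%   -- computeP evaluates the delayed result in parallel
%   scaled :: IO (Array U DIM2 Double)
%   scaled = computeP (scale 10 grid)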
@article{McKenney:1992:GPC:130616.130622,
author = {McKenney, Bruce and Szymanski, Boleslaw K.},
title = {Generating Parallel Code for SIMD Machines},
journal = {ACM Lett. Program. Lang. Syst.},
issue_date = {March 1992},
volume = 1,
number = 1,
month = mar,
year = 1992,
issn = {1057-4514},
pages = {59--73},
numpages = 15,
url = {http://doi.acm.org/10.1145/130616.130622},
doi = {10.1145/130616.130622},
acmid = 130622,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {data parallelism},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=130622&ftid=32971&dwn=1&CFID=574747772&CFTOKEN=80047865},
  abstract = {Massively parallel SIMD machines rely on data
              parallelism, usually achieved by careful hand coding,
              to support program efficiency. This paper describes
              parallelization of code generated for SIMD machines
              by the compiler for the Equational Programming
              Language, EPL. The language supports
              architecture-independent scientific programming by
              recurrent equations. The EPL compiler serves as a
              programming aid for users of parallel machines by
              automating data partitioning and computation
              parallelization based on inherent data
              dependencies. In support of a Connection Machine
              architecture, the EPL compiler performs horizontal
              partitioning of the program, a process that selects a
              dimension of each data structure to be projected
              along the processor array. Each processor then holds
              a single instance of that structure and operations
              along the projected dimension are done in
              parallel. The paper describes horizontal
              partitioning, code generation in MPL and the
              efficiency of programs generated for the MasPar SIMD
              machine.},
review = {fbie: rejected <2016-01-14 12:09:51>},
}
@inproceedings{Lippmeier:2012:WEH:2364527.2364564,
author = {Lippmeier, Ben and Chakravarty, Manuel M.T. and
Keller, Gabriele and Leshchinskiy, Roman and Peyton
Jones, Simon},
title = {Work Efficient Higher-order Vectorisation},
booktitle = {Proceedings of the 17th ACM SIGPLAN International
Conference on Functional Programming},
series = {ICFP '12},
year = 2012,
isbn = {978-1-4503-1054-3},
location = {Copenhagen, Denmark},
pages = {259--270},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364527.2364564},
doi = {10.1145/2364527.2364564},
acmid = 2364564,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Existing approaches to higher-order vectorisation,
also known as flattening nested data parallelism, do
not preserve the asymptotic work complexity of the
source program. Straightforward examples, such as
sparse matrix-vector multiplication, can suffer a
severe blow-up in both time and space, which limits
the practicality of this method. We discuss why this
problem arises, identify the mis-handling of index
space transforms as the root cause, and present a
solution using a refined representation of nested
arrays. We have implemented this solution in Data
Parallel Haskell (DPH) and present benchmarks
showing that realistic programs, which used to
suffer the blow-up, now have the correct asymptotic
work complexity. In some cases, the asymptotic
complexity of the vectorised program is even better
than the original.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364564&ftid=1282930&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:09:45>},
}
@article{Lippmeier:2012:WEH:2398856.2364564,
author = {Lippmeier, Ben and Chakravarty, Manuel M.T. and
Keller, Gabriele and Leshchinskiy, Roman and Peyton
Jones, Simon},
title = {Work Efficient Higher-order Vectorisation},
journal = {SIGPLAN Not.},
issue_date = {September 2012},
volume = 47,
number = 9,
month = sep,
year = 2012,
issn = {0362-1340},
pages = {259--270},
numpages = 12,
url = {http://doi.acm.org/10.1145/2398856.2364564},
doi = {10.1145/2398856.2364564},
acmid = 2364564,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Existing approaches to higher-order vectorisation,
also known as flattening nested data parallelism, do
not preserve the asymptotic work complexity of the
source program. Straightforward examples, such as
sparse matrix-vector multiplication, can suffer a
severe blow-up in both time and space, which limits
the practicality of this method. We discuss why this
problem arises, identify the mis-handling of index
space transforms as the root cause, and present a
solution using a refined representation of nested
arrays. We have implemented this solution in Data
Parallel Haskell (DPH) and present benchmarks
showing that realistic programs, which used to
suffer the blow-up, now have the correct asymptotic
work complexity. In some cases, the asymptotic
complexity of the vectorised program is even better
than the original.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364564&ftid=1282930&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:09:53>},
}
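% The sparse matrix-vector example the abstract cites, written with
% ordinary lists as a stand-in for DPH's nested parallel arrays (a
% sketch of the nested-parallel structure only, not DPH notation):
%
%   -- Each row is a sparse list of (column, value) pairs. The outer
%   -- comprehension is parallel over rows, the inner one over a
%   -- row's nonzeros: exactly the nested parallelism that flattening
%   -- must translate without losing work efficiency.
%   type SparseRow = [(Int, Double)]
%
%   smvm :: [SparseRow] -> [Double] -> [Double]
%   smvm rows vec =
%     [ sum [ x * (vec !! i) | (i, x) <- row ] | row <- rows ]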
@inproceedings{Chakravarty:2013:DPH:2502323.2508151,
author = {Chakravarty, Manuel M.T.},
title = {Data Parallelism in Haskell},
  booktitle = {Proceedings of the 2nd ACM SIGPLAN Workshop on
Functional High-performance Computing},
series = {FHPC '13},
year = 2013,
isbn = {978-1-4503-2381-9},
location = {Boston, Massachusetts, USA},
pages = {97--98},
numpages = 2,
url = {http://doi.acm.org/10.1145/2502323.2508151},
doi = {10.1145/2502323.2508151},
acmid = 2508151,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {array programming, code optimisation, data
parallelism, haskell},
abstract = {The implicit data parallelism in collective
operations on aggregate data structures constitutes
an attractive parallel programming model for
functional languages. Beginning with our work on
integrating nested data parallelism into Haskell, we
explored a variety of different approaches to
array-centric data parallel programming in Haskell,
experimented with a range of code generation and
optimisation strategies, and targeted both multicore
CPUs and GPUs. In addition to practical tools for
parallel programming, the outcomes of this research
programme include more widely applicable concepts,
such as Haskell's type families and stream
fusion. In this talk, I will contrast the different
approaches to data parallel programming that we
explored. I will discuss their strengths and
weaknesses and review what we have learnt in the
course of exploring the various options. This
includes our experience of implementing these
                  approaches in the Glasgow Haskell Compiler as well as
the experimental results that we have gathered so
far. Finally, I will outline the remaining open
challenges and our plans for the future. This talk
is based on joint work with Gabriele Keller, Sean
Lee, Roman Leshchinskiy, Ben Lippmeier, Trevor
L. McDonell, and Simon Peyton Jones.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2508151&ftid=1397480&dwn=1&CFID=574762219&CFTOKEN=10899110},
fullTextFile = {.slirm_cache/Chakravarty_2013_Data.pdf},
  notes = {This is a talk; the paper consists only of the
           abstract.},
review = {fbie: accepted <2016-01-12 15:12:05>},
}
@inproceedings{Chakravarty:2011:AHA:1926354.1926358,
author = {Chakravarty, Manuel M.T. and Keller, Gabriele and
Lee, Sean and McDonell, Trevor L. and Grover, Vinod},
title = {Accelerating Haskell Array Codes with Multicore
GPUs},
booktitle = {Proceedings of the Sixth Workshop on Declarative
Aspects of Multicore Programming},
series = {DAMP '11},
year = 2011,
isbn = {978-1-4503-0486-3},
location = {Austin, Texas, USA},
pages = {3--14},
numpages = 12,
url = {http://doi.acm.org/10.1145/1926354.1926358},
doi = {10.1145/1926354.1926358},
acmid = 1926358,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, dynamic compilation,
gpgpu, haskell, skeletons},
abstract = {Current GPUs are massively parallel multicore
processors optimised for workloads with a large
degree of SIMD parallelism. Good performance
requires highly idiomatic programs, whose
development is work intensive and requires expert
knowledge. To raise the level of abstraction, we
propose a domain-specific high-level language of
array computations that captures appropriate idioms
in the form of collective array operations. We embed
this purely functional array language in Haskell
with an online code generator for NVIDIA's CUDA
GPGPU programming environment. We regard the
embedded language's collective array operations as
algorithmic skeletons; our code generator
instantiates CUDA implementations of those skeletons
to execute embedded array programs. This paper
outlines our embedding in Haskell, details the
design and implementation of the dynamic code
generator, and reports on initial benchmark
results. These results suggest that we can compete
with moderately optimised native CUDA code, while
enabling much simpler source programs.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1926358&ftid=907606&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:13:14>},
}
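% The embedding style the abstract describes, as a minimal sketch
% assuming the accelerate package; the dot product below is the
% standard introductory example for this library:
%
%   import Data.Array.Accelerate as A
%
%   -- Collective operations on Acc arrays are the algorithmic
%   -- skeletons that the code generator instantiates as CUDA kernels.
%   dotp :: Acc (Vector Float) -> Acc (Vector Float)
%        -> Acc (Scalar Float)
%   dotp xs ys = A.fold (+) 0 (A.zipWith (*) xs ys)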
@article{Lippmeier:2012:GPA:2430532.2364511,
author = {Lippmeier, Ben and Chakravarty, Manuel and Keller,
Gabriele and Peyton Jones, Simon},
title = {Guiding Parallel Array Fusion with Indexed Types},
journal = {SIGPLAN Not.},
issue_date = {December 2012},
volume = 47,
number = 12,
  month = dec,
year = 2012,
issn = {0362-1340},
pages = {25--36},
numpages = 12,
url = {http://doi.acm.org/10.1145/2430532.2364511},
doi = {10.1145/2430532.2364511},
acmid = 2364511,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a refined approach to parallel array
fusion that uses indexed types to specify the
internal representation of each array. Our approach
aids the client programmer in reasoning about the
performance of their program in terms of the source
code. It also makes the intermediate code easier to
transform at compile-time, resulting in faster
compilation and more reliable runtimes. We
demonstrate how our new approach improves both the
clarity and performance of several end-user written
programs, including a fluid flow solver and an
interpolator for volumetric data.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364511&ftid=1282872&dwn=1&CFID=574762219&CFTOKEN=10899110},
notes = {Not quite sure, will accept it tentatively.},
review = {fbie: accepted <2016-01-12 15:14:07>},
}
@inproceedings{Lippmeier:2012:GPA:2364506.2364511,
author = {Lippmeier, Ben and Chakravarty, Manuel and Keller,
Gabriele and Peyton Jones, Simon},
title = {Guiding Parallel Array Fusion with Indexed Types},
booktitle = {Proceedings of the 2012 Haskell Symposium},
series = {Haskell '12},
year = 2012,
isbn = {978-1-4503-1574-6},
location = {Copenhagen, Denmark},
pages = {25--36},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364506.2364511},
doi = {10.1145/2364506.2364511},
acmid = 2364511,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a refined approach to parallel array
fusion that uses indexed types to specify the
internal representation of each array. Our approach
aids the client programmer in reasoning about the
performance of their program in terms of the source
code. It also makes the intermediate code easier to
transform at compile-time, resulting in faster
compilation and more reliable runtimes. We
demonstrate how our new approach improves both the
clarity and performance of several end-user written
programs, including a fluid flow solver and an
interpolator for volumetric data.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364511&ftid=1282872&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:14:17>},
}
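% The representation-index idea, sketched against the repa 3 API
% this paper describes (the function name is illustrative):
%
%   import Data.Array.Repa as R
%
%   -- The index U means "manifest, unboxed"; D means "delayed".
%   -- Keeping the representation in the type makes fusion visible
%   -- in the source: both maps fuse into the single parallel
%   -- traversal forced by computeP.
%   step :: Array U DIM1 Double -> IO (Array U DIM1 Double)
%   step a = computeP (R.map (+ 1) (R.map (* 2) a))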
@article{Lippmeier:2011:EPS:2096148.2034684,
author = {Lippmeier, Ben and Keller, Gabriele},
title = {Efficient Parallel Stencil Convolution in Haskell},
journal = {SIGPLAN Not.},
issue_date = {December 2011},
volume = 46,
number = 12,
  month = dec,
year = 2011,
issn = {0362-1340},
pages = {59--70},
numpages = 12,
url = {http://doi.acm.org/10.1145/2096148.2034684},
doi = {10.1145/2096148.2034684},
acmid = 2034684,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Stencil convolution is a fundamental building block
of many scientific and image processing
algorithms. We present a declarative approach to
writing such convolutions in Haskell that is both
efficient at runtime and implicitly parallel. To
achieve this we extend our prior work on the Repa
array library with two new features: partitioned and
cursored arrays. Combined with careful management of
the interaction between GHC and its back-end code
generator LLVM, we achieve performance comparable to
the standard OpenCV library.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2034684&ftid=1035453&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:15:13>},
}
@inproceedings{Lippmeier:2011:EPS:2034675.2034684,
author = {Lippmeier, Ben and Keller, Gabriele},
title = {Efficient Parallel Stencil Convolution in Haskell},
booktitle = {Proceedings of the 4th ACM Symposium on Haskell},
series = {Haskell '11},
year = 2011,
isbn = {978-1-4503-0860-1},
location = {Tokyo, Japan},
pages = {59--70},
numpages = 12,
url = {http://doi.acm.org/10.1145/2034675.2034684},
doi = {10.1145/2034675.2034684},
acmid = 2034684,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Stencil convolution is a fundamental building block
of many scientific and image processing
algorithms. We present a declarative approach to
writing such convolutions in Haskell that is both
efficient at runtime and implicitly parallel. To
achieve this we extend our prior work on the Repa
array library with two new features: partitioned and
cursored arrays. Combined with careful management of
the interaction between GHC and its back-end code
generator LLVM, we achieve performance comparable to
the standard OpenCV library.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2034684&ftid=1035453&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:15:20>},
}
@inproceedings{Herhut:2009:CCS:1481839.1481847,
author = {Herhut, Stephan and Scholz, Sven-Bodo and Grelck,
Clemens},
title = {Controlling Chaos: On Safe Side-effects in
Data-parallel Operations},
booktitle = {Proceedings of the 4th Workshop on Declarative
Aspects of Multicore Programming},
series = {DAMP '09},
  year = 2009,
isbn = {978-1-60558-417-1},
location = {Savannah, GA, USA},
pages = {59--67},
numpages = 9,
url = {http://doi.acm.org/10.1145/1481839.1481847},
doi = {10.1145/1481839.1481847},
acmid = 1481847,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {concurrent side-effects, functional programming
languages, non-determinism},
abstract = {With the rising variety of hardware designs for
multi-core systems, the effectiveness in exploiting
implicit concurrency of programs plays a more vital
role for programming such systems than ever
before. We believe that a combination of a
data-parallel approach with a declarative
programming-style is up to that task: Data-parallel
approaches are known to enable compilers to make
efficient use of multi-processors without requiring
low-level program annotations. Combining the
data-parallel approach with a declarative
programming-style guarantees semantic equivalence
between sequential and concurrent executions of data
parallel operations. Furthermore, the side-effect
free setting and explicit model of dependencies
enables compilers to maximise the size of the
data-parallel program sections. However, the
strength of the rigidity of the declarative approach
also constitutes its weakness: Being bound to
observe all data dependencies categorically rules
out the use of side-effecting operations within
data-parallel sections. Not only does this limit the
size of these regions in certain situations, but it
may also hamper an effective workload
distribution. Considering side effects such as
plotting individual pixels of an image or output for
debugging purposes, there are situations where a
non-deterministic order of side-effects would not be
considered harmful at all. We propose a mechanism
for enabling such non-determinism on the execution
of side-effecting operations within data-parallel
sections without sacrificing the side-effect free
setting in general. Outside of the data-parallel
sections we ensure single-threading of
side-effecting operations using uniqueness
typing. Within data-parallel operations however we
allow the side-effecting operations of different
threads to occur in any order, as long as effects of
different threads are not interleaved. Furthermore,
we still model the dependencies arising from the
manipulated states within the data parallel
sections. This measure preserves the explicitness of
all data dependencies and therefore it preserves the
transformational potential of any restructuring
compiler.},
review = {fbie: rejected <2016-01-12 15:16:34>},
}
@inproceedings{Grelck:2007:SOS:1248648.1248654,
author = {Grelck, Clemens and Scholz, Sven-Bodo},
title = {SAC: Off-the-shelf Support for Data-parallelism on
Multicores},
booktitle = {Proceedings of the 2007 Workshop on Declarative
Aspects of Multicore Programming},
series = {DAMP '07},
year = 2007,
isbn = {978-1-59593-690-5},
location = {Nice, France},
pages = {25--33},
numpages = 9,
url = {http://doi.acm.org/10.1145/1248648.1248654},
doi = {10.1145/1248648.1248654},
acmid = 1248654,
publisher = {ACM},
address = {New York, NY, USA},
  keywords = {SaC, Single assignment C, automatic
parallelisation, data parallel programming, generic
array programming, multicore programming},
abstract = {The advent of multicore processors has raised new
demand for harnessing concurrency in the software
mass market. We summarise our previous work on the
data parallel, functional array processing language
SaC. Its compiler technology is geared towards
highly runtime-efficient support for shared memory
multiprocessors and, thus, is readily applicable to
today's off-the-shelf multicore systems.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1248654&ftid=415640&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:17:02>},
}
@inproceedings{Svensson:2015:CDT:2808091.2808093,
author = {Svensson, Bo Joel and Vollmer, Michael and Holk,
Eric and McDonell, Trevor L. and Newton, Ryan R.},
title = {Converting Data-parallelism to Task-parallelism by
Rewrites: Purely Functional Programs Across Multiple
GPUs},
booktitle = {Proceedings of the 4th ACM SIGPLAN Workshop on
Functional High-Performance Computing},
series = {FHPC 2015},
year = 2015,
isbn = {978-1-4503-3807-3},
location = {Vancouver, BC, Canada},
pages = {12--22},
numpages = 11,
url = {http://doi.acm.org/10.1145/2808091.2808093},
doi = {10.1145/2808091.2808093},
acmid = 2808093,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Data-parallelism, GPU, Haskell, Multi-device,
Scheduling},
abstract = { High-level domain-specific languages for array
processing on the GPU are increasingly common, but
they typically only run on a single GPU. As
computational power is distributed across more
devices, languages must target multiple devices
simultaneously. To this end, we present a
compositional translation that fissions
data-parallel programs in the Accelerate language,
allowing subsequent compiler and runtime stages to
map computations onto multiple devices for improved
performance---even programs that begin as a single
data-parallel kernel. },
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2808093&ftid=1614992&dwn=1&CFID=574762219&CFTOKEN=10899110},
notes = {GPUs are not a relevant topic.},
review = {fbie: rejected <2016-01-12 15:17:40>},
}
@inproceedings{Svensson:2012:PPH:2364474.2364477,
author = {Svensson, Bo Joel and Sheeran, Mary},
title = {Parallel Programming in Haskell Almost for Free: An
Embedding of Intel's Array Building Blocks},
booktitle = {Proceedings of the 1st ACM SIGPLAN Workshop on
Functional High-performance Computing},
series = {FHPC '12},
year = 2012,
isbn = {978-1-4503-1577-7},
location = {Copenhagen, Denmark},
pages = {3--14},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364474.2364477},
doi = {10.1145/2364474.2364477},
acmid = 2364477,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {array programming, data parallelism, dynamic
compilation, embedded language},
abstract = {Nowadays, performance in processors is increased by
adding more cores or wider vector units, or by
combining accelerators like GPUs and traditional
cores on a chip. Programming for these diverse
architectures is a challenge. We would like to
exploit all the resources at hand without putting
too much burden on the programmer. Ideally, the
programmer should be presented with a machine model
abstracted from the specific number of cores, SIMD
width or the existence of a GPU or not. Intel's
Array Building Blocks (ArBB) is a system that takes
on these challenges. ArBB is a language for data
parallel and nested data parallel programming,
embedded in C++. By offering a retargetable dynamic
compilation framework, it provides vectorisation and
threading to programmers without the need to write
highly architecture specific code. We aim to bring
the same benefits to the Haskell programmer by
implementing a Haskell frontend (embedding) of the
ArBB system. We call this embedding EmbArBB. We use
standard Haskell embedded language procedures to
provide an interface to the ArBB functionality in
Haskell. EmbArBB is work in progress and does not
currently support all of the ArBB
functionality. Some small programming examples
illustrate how the Haskell embedding is used to
write programs. ArBB code is short and to the point
in both C++ and Haskell. Matrix multiplication has
been benchmarked in sequential C++, ArBB in C++,
EmbArBB and the Repa library. The C++ and the
Haskell embeddings have almost identical
performance, showing that the Haskell embedding does
not impose any large extra overheads. Two image
processing algorithms have also been benchmarked
against Repa. In these benchmarks at least, EmbArBB
performance is much better than that of the Repa
library, indicating that building on ArBB may be a
cheap and easy approach to exploiting data
parallelism in Haskell.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364477&ftid=1282816&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:18:35>},
}
@inproceedings{Singh:2010:DDP:1708046.1708048,
author = {Singh, Satnam},
title = {Declarative Data-parallel Programming with the
Accelerator System},
booktitle = {Proceedings of the 5th ACM SIGPLAN Workshop on
Declarative Aspects of Multicore Programming},
series = {DAMP '10},
year = 2010,
isbn = {978-1-60558-859-9},
location = {Madrid, Spain},
pages = {1--2},
numpages = 2,
url = {http://doi.acm.org/10.1145/1708046.1708048},
doi = {10.1145/1708046.1708048},
acmid = 1708048,
publisher = {ACM},
address = {New York, NY, USA},
  keywords = {data-parallelism},
abstract = {The Accelerator project at Microsoft Research is
developing a data-parallel library which provides a
high level and accessible mechanism for producing
code that executes on GPUs (via DirectX) and X64
multi-cores using SIMD instructions. An experimental
                  target can also produce VHDL netlists which can be
implemented on FPGA circuits. Although the library
is developed in a mainstream imperative language the
user programs in what is essentially a functional
embedded domain specific language. The library
provides data-parallel arrays and data-parallel
operations e.g. element-wise operations, reductions,
and matrix transformations. It is also possible to
layer higher level domain specific data-parallel
languages on top of Accelerator e.g. parallel
bitonic sorters and mergers (e.g. Batcher's) have
been expressed in a combinator based library in F#
which has appealing properties for composing
computations through the use of higher order
functions. A key distinction between the Accelerator
approach for generating GPU code and the CUDA path
supported by NVidia is that Accelerator works
on-line by jit-ing rather than off-line by
generating programs that need to be further compiled
                  and executed. This greatly simplifies the usage model
for the programmer. The circuit generator target for
                  Accelerator cannot work by jit-ing, so it works in
off-line mode. The ability to target three quite
different architectures (GPUs, multi-core SIMD
instructions and FPGAs) is possible due to the
careful design of the Accelerator library by picking
just the right level of abstraction for the data and
its associated data-parallel operations. A series of
examples have been developed including applications
for image processing and motion estimation.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1708048&ftid=743279&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:20:55>},
}
@inproceedings{Claessen:2012:EAC:2103736.2103740,
author = {Claessen, Koen and Sheeran, Mary and Svensson, Bo
Joel},
title = {Expressive Array Constructs in an Embedded GPU
Kernel Programming Language},
booktitle = {Proceedings of the 7th Workshop on Declarative
Aspects and Applications of Multicore Programming},
series = {DAMP '12},
year = 2012,
isbn = {978-1-4503-1117-5},
location = {Philadelphia, Pennsylvania, USA},
pages = {21--30},
numpages = 10,
url = {http://doi.acm.org/10.1145/2103736.2103740},
doi = {10.1145/2103736.2103740},
acmid = 2103740,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, embedded domain specific
language, general purpose gpu programming, haskell},
abstract = {Graphics Processing Units (GPUs) are powerful
computing devices that with the advent of
                  CUDA/OpenCL are becoming useful for general purpose
computations. Obsidian is an embedded domain
specific language that generates CUDA kernels from
functional descriptions. A symbolic array
construction allows us to guarantee that
intermediate arrays are fused away. However, the
current array construction has some drawbacks; in
particular, arrays cannot be combined
efficiently. We add a new type of push arrays to the
existing Obsidian system in order to solve this
problem. The two array types complement each other,
and enable the definition of combinators that both
take apart and combine arrays, and that result in
efficient generated code. This extension to Obsidian
is demonstrated on a sequence of sorting kernels,
with good results. The case study also illustrates
the use of combinators for expressing the structure
of parallel algorithms. The work presented is
preliminary, and the combinators presented must be
generalised. However, the raw speed of the generated
kernels bodes well.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2103740&ftid=1093833&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:23:49>},
}
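% The pull/push distinction at the heart of the paper, in a hedged
% Haskell sketch; Obsidian's real types target CUDA code generation,
% and these simplified definitions only show the shape of the two
% representations:
%
%   -- A pull array is a length plus an index function: cheap to
%   -- index, awkward to concatenate without a conditional.
%   data Pull a = Pull Int (Int -> a)
%
%   -- A push array knows how to write itself via a callback; two
%   -- push arrays concatenate by merely offsetting the writer.
%   data Push m a = Push Int ((Int -> a -> m ()) -> m ())
%
%   append :: Monad m => Push m a -> Push m a -> Push m a
%   append (Push la f) (Push lb g) =
%     Push (la + lb) (\wr -> f wr >> g (\i x -> wr (la + i) x))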
@inproceedings{Madsen:2015:FAS:2808091.2808094,
author = {Madsen, Frederik M. and Clifton-Everest, Robert and
Chakravarty, Manuel M. T. and Keller, Gabriele},
title = {Functional Array Streams},
booktitle = {Proceedings of the 4th ACM SIGPLAN Workshop on
Functional High-Performance Computing},
series = {FHPC 2015},
year = 2015,
isbn = {978-1-4503-3807-3},
location = {Vancouver, BC, Canada},
pages = {23--34},
numpages = 12,
url = {http://doi.acm.org/10.1145/2808091.2808094},
doi = {10.1145/2808091.2808094},
acmid = 2808094,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Arrays, Data parallelism, Embedded language, GPGPU,
Haskell, Streams},
abstract = { Regular array languages for high performance
computing based on aggregate operations provide a
convenient parallel programming model, which enables
the generation of efficient code for SIMD
architectures, such as GPUs. However, the data sets
that can be processed with current implementations
are severely constrained by the limited amount of
main memory available in these architectures. In
this paper, we propose an extension of the embedded
array language Accelerate with a notion of
sequences, resulting in a two level hierarchy which
allows the programmer to specify a partitioning
strategy which facilitates automatic resource
allocation. Depending on the available memory, the
runtime system processes the overall data set in
streams of chunks appropriate to the hardware
parameters. In this paper, we present the language
design for the sequence operations, as well as the
compilation and runtime support, and demonstrate
with a set of benchmarks the feasibility of this
approach. },
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2808094&ftid=1614993&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:24:23>},
}
@inproceedings{Wernsing:2012:RHA:2380403.2380423,
author = {Wernsing, John Robert and Stitt, Greg and Fowers,
Jeremy},
title = {The RACECAR Heuristic for Automatic Function
Specialization on Multi-core Heterogeneous Systems},
booktitle = {Proceedings of the 2012 International Conference on
Compilers, Architectures and Synthesis for Embedded
Systems},
series = {CASES '12},
year = 2012,
isbn = {978-1-4503-1424-4},
location = {Tampere, Finland},
pages = {81--90},
numpages = 10,
url = {http://doi.acm.org/10.1145/2380403.2380423},
doi = {10.1145/2380403.2380423},
acmid = 2380423,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {elastic computing, execution time, fpga, gpu,
heterogeneous, optimization, performance prediction,
racecar},
abstract = {Embedded systems increasingly combine multi-core
processors and heterogeneous resources such as
graphics-processing units and field-programmable
gate arrays. However, significant application design
complexity for such systems caused by parallel
programming and device-specific challenges has often
led to untapped performance potential. Application
developers targeting such systems currently must
determine how to parallelize computation, create
different device-specialized implementations for
each heterogeneous resource, and then determine how
to apportion work to each resource. In this paper,
we present the RACECAR heuristic to automate the
optimization of applications for multi-core
heterogeneous systems by automatically exploring
implementation alternatives that include different
algorithms, parallelization strategies, and work
distributions. Experimental results show
RACECAR-specialized implementations can effectively
incorporate provided implementations and parallelize
computation across multiple cores,
graphics-processing units, and field-programmable
gate arrays, improving performance by an average of
47x compared to a CPU, while the fastest provided
implementations are only able to average 33x.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2380423&ftid=1294482&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:25:29>},
}
@article{Keller:2012:VA:2430532.2364512,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Lippmeier, Ben and Peyton
Jones, Simon},
title = {Vectorisation Avoidance},
journal = {SIGPLAN Not.},
issue_date = {December 2012},
volume = 47,
number = 12,
  month = dec,
year = 2012,
issn = {0362-1340},
pages = {37--48},
numpages = 12,
url = {http://doi.acm.org/10.1145/2430532.2364512},
doi = {10.1145/2430532.2364512},
acmid = 2364512,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {haskell, nested data parallelism, program
transformation},
abstract = {Flattening nested parallelism is a vectorising code
transform that converts irregular nested parallelism
into flat data parallelism. Although the result has
good asymptotic performance, flattening thoroughly
restructures the code. Many intermediate data
structures and traversals are introduced, which may
or may not be eliminated by subsequent
optimisation. We present a novel program analysis to
identify parts of the program where flattening would
only introduce overhead, without appropriate
gain. We present empirical evidence that avoiding
vectorisation in these cases leads to more efficient
programs than if we had applied vectorisation and
then relied on array fusion to eliminate
intermediates from the resulting code.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364512&ftid=1282873&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:26:38>},
}
@inproceedings{Keller:2012:VA:2364506.2364512,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Lippmeier, Ben and Peyton
Jones, Simon},
title = {Vectorisation Avoidance},
booktitle = {Proceedings of the 2012 Haskell Symposium},
series = {Haskell '12},
year = 2012,
isbn = {978-1-4503-1574-6},
location = {Copenhagen, Denmark},
pages = {37--48},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364506.2364512},
doi = {10.1145/2364506.2364512},
acmid = 2364512,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {haskell, nested data parallelism, program
transformation},
abstract = {Flattening nested parallelism is a vectorising code
transform that converts irregular nested parallelism
into flat data parallelism. Although the result has
good asymptotic performance, flattening thoroughly
restructures the code. Many intermediate data
structures and traversals are introduced, which may
or may not be eliminated by subsequent
optimisation. We present a novel program analysis to
identify parts of the program where flattening would
only introduce overhead, without appropriate
gain. We present empirical evidence that avoiding
vectorisation in these cases leads to more efficient
programs than if we had applied vectorisation and
then relied on array fusion to eliminate
intermediates from the resulting code.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364512&ftid=1282873&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:26:42>},
}
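% A sketch of the kind of code the analysis targets (illustrative
% Haskell, not the paper's DPH notation): the lambda body below is
% purely scalar, so flattening it into array-at-a-time operations
% (one intermediate for x * x, another for the + 1) would add
% traversals without exposing any new parallelism. The analysis
% leaves such bodies as scalar code inside a single parallel map.
%
%   squarePlusOne :: [Double] -> [Double]
%   squarePlusOne = map (\x -> x * x + 1)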
@inproceedings{Shafarenko:2002:CHT:571157.571160,
author = {Shafarenko, Alex},
title = {Coercion As Homomorphism: Type Inference in a System
with Subtyping and Overloading},
booktitle = {Proceedings of the 4th ACM SIGPLAN International
Conference on Principles and Practice of Declarative
Programming},
series = {PPDP '02},
year = 2002,
isbn = {1-58113-528-9},
location = {Pittsburgh, PA, USA},
pages = {14--25},
numpages = 12,
url = {http://doi.acm.org/10.1145/571157.571160},
doi = {10.1145/571157.571160},
acmid = 571160,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {array processing, data-parallel programming,
overloading, subtyping, type inference},
fullTextUrl =