% array-programming-acm.bib
@inproceedings{Collins:2014:NFL:2627373.2627375,
author = {Collins, Alexander and Grewe, Dominik and Grover,
Vinod and Lee, Sean and Susnea, Adriana},
title = {NOVA: A Functional Language for Data Parallelism},
booktitle = {Proceedings of ACM SIGPLAN International Workshop on
Libraries, Languages, and Compilers for Array
Programming},
series = {ARRAY'14},
year = 2014,
isbn = {978-1-4503-2937-8},
location = {Edinburgh, United Kingdom},
pages = {8:8--8:13},
articleno = 8,
numpages = 6,
url = {http://doi.acm.org/10.1145/2627373.2627375},
doi = {10.1145/2627373.2627375},
acmid = 2627375,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Array-oriented programming, CUDA, Code generation,
Compilation, Functional programming, Multi-core CPU},
abstract = {Functional languages provide a solid foundation on
which complex optimization passes can be designed to
exploit parallelism available in the underlying
system. Their mathematical foundations enable
high-level optimizations that would be impossible in
traditional imperative languages. This makes them
uniquely suited for generation of efficient target
code for parallel systems, such as multiple Central
Processing Units (CPUs) or highly data-parallel
Graphics Processing Units (GPUs). Such systems are
becoming the mainstream for scientific and commodity
desktop computing. Writing performance portable code
for such systems using low-level languages requires
significant effort from a human expert. This paper
presents NOVA, a functional language and compiler
for multi-core CPUs and GPUs. The NOVA language is a
polymorphic, statically-typed functional language
with a suite of higher-order functions which are
used to express parallelism. These include map,
reduce and scan. The NOVA compiler is a
light-weight, yet powerful, optimizing compiler. It
generates code for a variety of target platforms
that achieve performance comparable to competing
languages and tools, including hand-optimized
code. The NOVA compiler is stand-alone and can be
easily used as a target for higher-level or domain
specific languages or embedded in other
applications. We evaluate NOVA against two competing
approaches: the Thrust library and hand-written CUDA
C. NOVA achieves comparable performance to these
approaches across a range of
benchmarks. NOVA-generated code also scales linearly
with the number of processor cores across all
compute-bound benchmarks.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2627375&ftid=1503126&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:08:12>},
}
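% NOVA's concrete syntax is not shown in the abstract. As a rough,
% hypothetical sketch of the programming model it describes, namely
% parallelism expressed through the higher-order functions map,
% reduce and scan, here is the same style in Haskell, with plain
% lists standing in for NOVA's parallel arrays:
%
%   -- dot product as zipWith plus a reduction
%   dotProduct :: [Double] -> [Double] -> Double
%   dotProduct xs ys = foldr (+) 0 (zipWith (*) xs ys)
%
%   -- running totals via a scan
%   prefixSums :: [Double] -> [Double]
%   prefixSums = scanl (+) 0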
@article{Keller:2010:RSP:1932681.1863582,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Peyton Jones, Simon and
Lippmeier, Ben},
title = {Regular, Shape-polymorphic, Parallel Arrays in
Haskell},
journal = {SIGPLAN Not.},
issue_date = {September 2010},
volume = 45,
number = 9,
month = sep,
year = 2010,
issn = {0362-1340},
pages = {261--272},
numpages = 12,
url = {http://doi.acm.org/10.1145/1932681.1863582},
doi = {10.1145/1932681.1863582},
acmid = 1863582,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a novel approach to regular,
multi-dimensional arrays in Haskell. The main
highlights of our approach are that it (1) is purely
functional, (2) supports reuse through shape
polymorphism, (3) avoids unnecessary intermediate
structures rather than relying on subsequent loop
fusion, and (4) supports transparent
parallelisation. We show how to embed two forms of
shape polymorphism into Haskell's type system using
type classes and type families. In particular, we
discuss the generalisation of regular array
transformations to arrays of higher rank, and
introduce a type-safe specification of array
slices. We discuss the runtime performance of our
approach for three standard array algorithms. We
achieve absolute performance comparable to
handwritten C code. At the same time, our
implementation scales well up to 8 processor cores.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1863582&ftid=845214&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:08:58>},
}
@inproceedings{Keller:2010:RSP:1863543.1863582,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Peyton Jones, Simon and
Lippmeier, Ben},
title = {Regular, Shape-polymorphic, Parallel Arrays in
Haskell},
booktitle = {Proceedings of the 15th ACM SIGPLAN International
Conference on Functional Programming},
series = {ICFP '10},
year = 2010,
isbn = {978-1-60558-794-3},
location = {Baltimore, Maryland, USA},
pages = {261--272},
numpages = 12,
url = {http://doi.acm.org/10.1145/1863543.1863582},
doi = {10.1145/1863543.1863582},
acmid = 1863582,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a novel approach to regular,
multi-dimensional arrays in Haskell. The main
highlights of our approach are that it (1) is purely
functional, (2) supports reuse through shape
polymorphism, (3) avoids unnecessary intermediate
structures rather than relying on subsequent loop
fusion, and (4) supports transparent
parallelisation. We show how to embed two forms of
shape polymorphism into Haskell's type system using
type classes and type families. In particular, we
discuss the generalisation of regular array
transformations to arrays of higher rank, and
introduce a type-safe specification of array
slices. We discuss the runtime performance of our
approach for three standard array algorithms. We
achieve absolute performance comparable to
handwritten C code. At the same time, our
implementation scales well up to 8 processor cores.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1863582&ftid=845214&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:09:02>},
}
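% A minimal sketch of the shape polymorphism the paper describes,
% written against the repa package API that grew out of this work;
% the function and value names here are illustrative:
%
%   import Data.Array.Repa as R
%
%   -- shape-polymorphic: scale works for arrays of any rank sh
%   scale :: (Shape sh, Source r Double)
%         => Double -> Array r sh Double -> Array D sh Double
%   scale k = R.map (* k)
%
%   -- a rank-2 instance; DIM2 abbreviates Z :. Int :. Int
%   grid :: Array U DIM2 Double
%   grid = fromListUnboxed (Z :. 2 :. 3) [1 .. 6]
%
%   -- computeP evaluates the delayed result in parallel
%   scaled :: IO (Array U DIM2 Double)
%   scaled = computeP (scale 10 grid)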
@article{McKenney:1992:GPC:130616.130622,
author = {McKenney, Bruce and Szymanski, Boleslaw K.},
title = {Generating Parallel Code for SIMD Machines},
journal = {ACM Lett. Program. Lang. Syst.},
issue_date = {March 1992},
volume = 1,
number = 1,
month = mar,
year = 1992,
issn = {1057-4514},
pages = {59--73},
numpages = 15,
url = {http://doi.acm.org/10.1145/130616.130622},
doi = {10.1145/130616.130622},
acmid = 130622,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {data parallelism},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=130622&ftid=32971&dwn=1&CFID=574747772&CFTOKEN=80047865},
  abstract = {Massively parallel SIMD machines rely on data
              parallelism, usually achieved by careful hand coding,
              to support program efficiency. This paper describes
              parallelization of code generated for SIMD machines
              by the compiler for the Equational Programming
              Language, EPL. The language supports
              architecture-independent scientific programming by
              recurrent equations. The EPL compiler serves as a
              programming aid for users of parallel machines by
              automating data partitioning and computation
              parallelization based on inherent data
              dependencies. In support of a Connection Machine
              architecture, the EPL compiler performs horizontal
              partitioning of the program, a process that selects a
              dimension of each data structure to be projected
              along the processor array. Each processor then holds
              a single instance of that structure and operations
              along the projected dimension are done in
              parallel. The paper describes horizontal
              partitioning, code generation in MPL and the
              efficiency of programs generated for the MasPar SIMD
              machine.},
review = {fbie: rejected <2016-01-14 12:09:51>},
}
@inproceedings{Lippmeier:2012:WEH:2364527.2364564,
author = {Lippmeier, Ben and Chakravarty, Manuel M.T. and
Keller, Gabriele and Leshchinskiy, Roman and Peyton
Jones, Simon},
title = {Work Efficient Higher-order Vectorisation},
booktitle = {Proceedings of the 17th ACM SIGPLAN International
Conference on Functional Programming},
series = {ICFP '12},
year = 2012,
isbn = {978-1-4503-1054-3},
location = {Copenhagen, Denmark},
pages = {259--270},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364527.2364564},
doi = {10.1145/2364527.2364564},
acmid = 2364564,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Existing approaches to higher-order vectorisation,
also known as flattening nested data parallelism, do
not preserve the asymptotic work complexity of the
source program. Straightforward examples, such as
sparse matrix-vector multiplication, can suffer a
severe blow-up in both time and space, which limits
the practicality of this method. We discuss why this
problem arises, identify the mis-handling of index
space transforms as the root cause, and present a
solution using a refined representation of nested
arrays. We have implemented this solution in Data
Parallel Haskell (DPH) and present benchmarks
showing that realistic programs, which used to
suffer the blow-up, now have the correct asymptotic
work complexity. In some cases, the asymptotic
complexity of the vectorised program is even better
than the original.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364564&ftid=1282930&dwn=1&CFID=574747772&CFTOKEN=80047865},
review = {fbie: accepted <2016-01-12 15:09:45>},
}
@article{Lippmeier:2012:WEH:2398856.2364564,
author = {Lippmeier, Ben and Chakravarty, Manuel M.T. and
Keller, Gabriele and Leshchinskiy, Roman and Peyton
Jones, Simon},
title = {Work Efficient Higher-order Vectorisation},
journal = {SIGPLAN Not.},
issue_date = {September 2012},
volume = 47,
number = 9,
month = sep,
year = 2012,
issn = {0362-1340},
pages = {259--270},
numpages = 12,
url = {http://doi.acm.org/10.1145/2398856.2364564},
doi = {10.1145/2398856.2364564},
acmid = 2364564,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Existing approaches to higher-order vectorisation,
also known as flattening nested data parallelism, do
not preserve the asymptotic work complexity of the
source program. Straightforward examples, such as
sparse matrix-vector multiplication, can suffer a
severe blow-up in both time and space, which limits
the practicality of this method. We discuss why this
problem arises, identify the mis-handling of index
space transforms as the root cause, and present a
solution using a refined representation of nested
arrays. We have implemented this solution in Data
Parallel Haskell (DPH) and present benchmarks
showing that realistic programs, which used to
suffer the blow-up, now have the correct asymptotic
work complexity. In some cases, the asymptotic
complexity of the vectorised program is even better
than the original.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364564&ftid=1282930&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:09:53>},
}
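% The sparse matrix-vector example the abstract cites, written with
% ordinary lists as a stand-in for DPH's nested parallel arrays (a
% sketch of the nested-parallel structure only, not DPH notation):
%
%   -- Each row is a sparse list of (column, value) pairs. The outer
%   -- comprehension is parallel over rows, the inner one over a
%   -- row's nonzeros: exactly the nested parallelism that flattening
%   -- must translate without losing work efficiency.
%   type SparseRow = [(Int, Double)]
%
%   smvm :: [SparseRow] -> [Double] -> [Double]
%   smvm rows vec =
%     [ sum [ x * (vec !! i) | (i, x) <- row ] | row <- rows ]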
@inproceedings{Chakravarty:2013:DPH:2502323.2508151,
author = {Chakravarty, Manuel M.T.},
title = {Data Parallelism in Haskell},
  booktitle = {Proceedings of the 2nd ACM SIGPLAN Workshop on
Functional High-performance Computing},
series = {FHPC '13},
year = 2013,
isbn = {978-1-4503-2381-9},
location = {Boston, Massachusetts, USA},
pages = {97--98},
numpages = 2,
url = {http://doi.acm.org/10.1145/2502323.2508151},
doi = {10.1145/2502323.2508151},
acmid = 2508151,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {array programming, code optimisation, data
parallelism, haskell},
abstract = {The implicit data parallelism in collective
operations on aggregate data structures constitutes
an attractive parallel programming model for
functional languages. Beginning with our work on
integrating nested data parallelism into Haskell, we
explored a variety of different approaches to
array-centric data parallel programming in Haskell,
experimented with a range of code generation and
optimisation strategies, and targeted both multicore
CPUs and GPUs. In addition to practical tools for
parallel programming, the outcomes of this research
programme include more widely applicable concepts,
such as Haskell's type families and stream
fusion. In this talk, I will contrast the different
approaches to data parallel programming that we
explored. I will discuss their strengths and
weaknesses and review what we have learnt in the
course of exploring the various options. This
includes our experience of implementing these
                  approaches in the Glasgow Haskell Compiler as well as
the experimental results that we have gathered so
far. Finally, I will outline the remaining open
challenges and our plans for the future. This talk
is based on joint work with Gabriele Keller, Sean
Lee, Roman Leshchinskiy, Ben Lippmeier, Trevor
L. McDonell, and Simon Peyton Jones.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2508151&ftid=1397480&dwn=1&CFID=574762219&CFTOKEN=10899110},
fullTextFile = {.slirm_cache/Chakravarty_2013_Data.pdf},
  notes = {This is a talk; the paper consists only of the
           abstract.},
review = {fbie: accepted <2016-01-12 15:12:05>},
}
@inproceedings{Chakravarty:2011:AHA:1926354.1926358,
author = {Chakravarty, Manuel M.T. and Keller, Gabriele and
Lee, Sean and McDonell, Trevor L. and Grover, Vinod},
title = {Accelerating Haskell Array Codes with Multicore
GPUs},
booktitle = {Proceedings of the Sixth Workshop on Declarative
Aspects of Multicore Programming},
series = {DAMP '11},
year = 2011,
isbn = {978-1-4503-0486-3},
location = {Austin, Texas, USA},
pages = {3--14},
numpages = 12,
url = {http://doi.acm.org/10.1145/1926354.1926358},
doi = {10.1145/1926354.1926358},
acmid = 1926358,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, dynamic compilation,
gpgpu, haskell, skeletons},
abstract = {Current GPUs are massively parallel multicore
processors optimised for workloads with a large
degree of SIMD parallelism. Good performance
requires highly idiomatic programs, whose
development is work intensive and requires expert
knowledge. To raise the level of abstraction, we
propose a domain-specific high-level language of
array computations that captures appropriate idioms
in the form of collective array operations. We embed
this purely functional array language in Haskell
with an online code generator for NVIDIA's CUDA
GPGPU programming environment. We regard the
embedded language's collective array operations as
algorithmic skeletons; our code generator
instantiates CUDA implementations of those skeletons
to execute embedded array programs. This paper
outlines our embedding in Haskell, details the
design and implementation of the dynamic code
generator, and reports on initial benchmark
results. These results suggest that we can compete
with moderately optimised native CUDA code, while
enabling much simpler source programs.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1926358&ftid=907606&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:13:14>},
}
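% The embedding style the abstract describes, as a minimal sketch
% assuming the accelerate package; the dot product below is the
% standard introductory example for this library:
%
%   import Data.Array.Accelerate as A
%
%   -- Collective operations on Acc arrays are the algorithmic
%   -- skeletons that the code generator instantiates as CUDA kernels.
%   dotp :: Acc (Vector Float) -> Acc (Vector Float)
%        -> Acc (Scalar Float)
%   dotp xs ys = A.fold (+) 0 (A.zipWith (*) xs ys)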
@article{Lippmeier:2012:GPA:2430532.2364511,
author = {Lippmeier, Ben and Chakravarty, Manuel and Keller,
Gabriele and Peyton Jones, Simon},
title = {Guiding Parallel Array Fusion with Indexed Types},
journal = {SIGPLAN Not.},
issue_date = {December 2012},
volume = 47,
number = 12,
  month = dec,
year = 2012,
issn = {0362-1340},
pages = {25--36},
numpages = 12,
url = {http://doi.acm.org/10.1145/2430532.2364511},
doi = {10.1145/2430532.2364511},
acmid = 2364511,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a refined approach to parallel array
fusion that uses indexed types to specify the
internal representation of each array. Our approach
aids the client programmer in reasoning about the
performance of their program in terms of the source
code. It also makes the intermediate code easier to
transform at compile-time, resulting in faster
compilation and more reliable runtimes. We
demonstrate how our new approach improves both the
clarity and performance of several end-user written
programs, including a fluid flow solver and an
interpolator for volumetric data.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364511&ftid=1282872&dwn=1&CFID=574762219&CFTOKEN=10899110},
notes = {Not quite sure, will accept it tentatively.},
review = {fbie: accepted <2016-01-12 15:14:07>},
}
@inproceedings{Lippmeier:2012:GPA:2364506.2364511,
author = {Lippmeier, Ben and Chakravarty, Manuel and Keller,
Gabriele and Peyton Jones, Simon},
title = {Guiding Parallel Array Fusion with Indexed Types},
booktitle = {Proceedings of the 2012 Haskell Symposium},
series = {Haskell '12},
year = 2012,
isbn = {978-1-4503-1574-6},
location = {Copenhagen, Denmark},
pages = {25--36},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364506.2364511},
doi = {10.1145/2364506.2364511},
acmid = 2364511,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {We present a refined approach to parallel array
fusion that uses indexed types to specify the
internal representation of each array. Our approach
aids the client programmer in reasoning about the
performance of their program in terms of the source
code. It also makes the intermediate code easier to
transform at compile-time, resulting in faster
compilation and more reliable runtimes. We
demonstrate how our new approach improves both the
clarity and performance of several end-user written
programs, including a fluid flow solver and an
interpolator for volumetric data.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364511&ftid=1282872&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:14:17>},
}
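% The representation-index idea, sketched against the repa 3 API
% this paper describes (the function name is illustrative):
%
%   import Data.Array.Repa as R
%
%   -- The index U means "manifest, unboxed"; D means "delayed".
%   -- Keeping the representation in the type makes fusion visible
%   -- in the source: both maps fuse into the single parallel
%   -- traversal forced by computeP.
%   step :: Array U DIM1 Double -> IO (Array U DIM1 Double)
%   step a = computeP (R.map (+ 1) (R.map (* 2) a))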
@article{Lippmeier:2011:EPS:2096148.2034684,
author = {Lippmeier, Ben and Keller, Gabriele},
title = {Efficient Parallel Stencil Convolution in Haskell},
journal = {SIGPLAN Not.},
issue_date = {December 2011},
volume = 46,
number = 12,
  month = dec,
year = 2011,
issn = {0362-1340},
pages = {59--70},
numpages = 12,
url = {http://doi.acm.org/10.1145/2096148.2034684},
doi = {10.1145/2096148.2034684},
acmid = 2034684,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Stencil convolution is a fundamental building block
of many scientific and image processing
algorithms. We present a declarative approach to
writing such convolutions in Haskell that is both
efficient at runtime and implicitly parallel. To
achieve this we extend our prior work on the Repa
array library with two new features: partitioned and
cursored arrays. Combined with careful management of
the interaction between GHC and its back-end code
generator LLVM, we achieve performance comparable to
the standard OpenCV library.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2034684&ftid=1035453&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:15:13>},
}
@inproceedings{Lippmeier:2011:EPS:2034675.2034684,
author = {Lippmeier, Ben and Keller, Gabriele},
title = {Efficient Parallel Stencil Convolution in Haskell},
booktitle = {Proceedings of the 4th ACM Symposium on Haskell},
series = {Haskell '11},
year = 2011,
isbn = {978-1-4503-0860-1},
location = {Tokyo, Japan},
pages = {59--70},
numpages = 12,
url = {http://doi.acm.org/10.1145/2034675.2034684},
doi = {10.1145/2034675.2034684},
acmid = 2034684,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, haskell},
abstract = {Stencil convolution is a fundamental building block
of many scientific and image processing
algorithms. We present a declarative approach to
writing such convolutions in Haskell that is both
efficient at runtime and implicitly parallel. To
achieve this we extend our prior work on the Repa
array library with two new features: partitioned and
cursored arrays. Combined with careful management of
the interaction between GHC and its back-end code
generator LLVM, we achieve performance comparable to
the standard OpenCV library.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2034684&ftid=1035453&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:15:20>},
}
@inproceedings{Herhut:2009:CCS:1481839.1481847,
author = {Herhut, Stephan and Scholz, Sven-Bodo and Grelck,
Clemens},
title = {Controlling Chaos: On Safe Side-effects in
Data-parallel Operations},
booktitle = {Proceedings of the 4th Workshop on Declarative
Aspects of Multicore Programming},
series = {DAMP '09},
  year = 2009,
isbn = {978-1-60558-417-1},
location = {Savannah, GA, USA},
pages = {59--67},
numpages = 9,
url = {http://doi.acm.org/10.1145/1481839.1481847},
doi = {10.1145/1481839.1481847},
acmid = 1481847,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {concurrent side-effects, functional programming
languages, non-determinism},
abstract = {With the rising variety of hardware designs for
multi-core systems, the effectiveness in exploiting
implicit concurrency of programs plays a more vital
role for programming such systems than ever
before. We believe that a combination of a
data-parallel approach with a declarative
programming-style is up to that task: Data-parallel
approaches are known to enable compilers to make
efficient use of multi-processors without requiring
low-level program annotations. Combining the
data-parallel approach with a declarative
programming-style guarantees semantic equivalence
between sequential and concurrent executions of data
parallel operations. Furthermore, the side-effect
free setting and explicit model of dependencies
enables compilers to maximise the size of the
data-parallel program sections. However, the
strength of the rigidity of the declarative approach
also constitutes its weakness: Being bound to
observe all data dependencies categorically rules
out the use of side-effecting operations within
data-parallel sections. Not only does this limit the
size of these regions in certain situations, but it
may also hamper an effective workload
distribution. Considering side effects such as
plotting individual pixels of an image or output for
debugging purposes, there are situations where a
non-deterministic order of side-effects would not be
considered harmful at all. We propose a mechanism
for enabling such non-determinism on the execution
of side-effecting operations within data-parallel
sections without sacrificing the side-effect free
setting in general. Outside of the data-parallel
sections we ensure single-threading of
side-effecting operations using uniqueness
typing. Within data-parallel operations however we
allow the side-effecting operations of different
threads to occur in any order, as long as effects of
different threads are not interleaved. Furthermore,
we still model the dependencies arising from the
manipulated states within the data parallel
sections. This measure preserves the explicitness of
all data dependencies and therefore it preserves the
transformational potential of any restructuring
compiler.},
review = {fbie: rejected <2016-01-12 15:16:34>},
}
@inproceedings{Grelck:2007:SOS:1248648.1248654,
author = {Grelck, Clemens and Scholz, Sven-Bodo},
title = {SAC: Off-the-shelf Support for Data-parallelism on
Multicores},
booktitle = {Proceedings of the 2007 Workshop on Declarative
Aspects of Multicore Programming},
series = {DAMP '07},
year = 2007,
isbn = {978-1-59593-690-5},
location = {Nice, France},
pages = {25--33},
numpages = 9,
url = {http://doi.acm.org/10.1145/1248648.1248654},
doi = {10.1145/1248648.1248654},
acmid = 1248654,
publisher = {ACM},
address = {New York, NY, USA},
  keywords = {SaC, Single assignment C, automatic
parallelisation, data parallel programming, generic
array programming, multicore programming},
abstract = {The advent of multicore processors has raised new
demand for harnessing concurrency in the software
mass market. We summarise our previous work on the
data parallel, functional array processing language
SaC. Its compiler technology is geared towards
highly runtime-efficient support for shared memory
multiprocessors and, thus, is readily applicable to
today's off-the-shelf multicore systems.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1248654&ftid=415640&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:17:02>},
}
@inproceedings{Svensson:2015:CDT:2808091.2808093,
author = {Svensson, Bo Joel and Vollmer, Michael and Holk,
Eric and McDonell, Trevor L. and Newton, Ryan R.},
title = {Converting Data-parallelism to Task-parallelism by
Rewrites: Purely Functional Programs Across Multiple
GPUs},
booktitle = {Proceedings of the 4th ACM SIGPLAN Workshop on
Functional High-Performance Computing},
series = {FHPC 2015},
year = 2015,
isbn = {978-1-4503-3807-3},
location = {Vancouver, BC, Canada},
pages = {12--22},
numpages = 11,
url = {http://doi.acm.org/10.1145/2808091.2808093},
doi = {10.1145/2808091.2808093},
acmid = 2808093,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Data-parallelism, GPU, Haskell, Multi-device,
Scheduling},
abstract = { High-level domain-specific languages for array
processing on the GPU are increasingly common, but
they typically only run on a single GPU. As
computational power is distributed across more
devices, languages must target multiple devices
simultaneously. To this end, we present a
compositional translation that fissions
data-parallel programs in the Accelerate language,
allowing subsequent compiler and runtime stages to
map computations onto multiple devices for improved
performance---even programs that begin as a single
data-parallel kernel. },
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2808093&ftid=1614992&dwn=1&CFID=574762219&CFTOKEN=10899110},
notes = {GPUs are not a relevant topic.},
review = {fbie: rejected <2016-01-12 15:17:40>},
}
@inproceedings{Svensson:2012:PPH:2364474.2364477,
author = {Svensson, Bo Joel and Sheeran, Mary},
title = {Parallel Programming in Haskell Almost for Free: An
Embedding of Intel's Array Building Blocks},
booktitle = {Proceedings of the 1st ACM SIGPLAN Workshop on
Functional High-performance Computing},
series = {FHPC '12},
year = 2012,
isbn = {978-1-4503-1577-7},
location = {Copenhagen, Denmark},
pages = {3--14},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364474.2364477},
doi = {10.1145/2364474.2364477},
acmid = 2364477,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {array programming, data parallelism, dynamic
compilation, embedded language},
abstract = {Nowadays, performance in processors is increased by
adding more cores or wider vector units, or by
combining accelerators like GPUs and traditional
cores on a chip. Programming for these diverse
architectures is a challenge. We would like to
exploit all the resources at hand without putting
too much burden on the programmer. Ideally, the
programmer should be presented with a machine model
abstracted from the specific number of cores, SIMD
width or the existence of a GPU or not. Intel's
Array Building Blocks (ArBB) is a system that takes
on these challenges. ArBB is a language for data
parallel and nested data parallel programming,
embedded in C++. By offering a retargetable dynamic
compilation framework, it provides vectorisation and
threading to programmers without the need to write
highly architecture specific code. We aim to bring
the same benefits to the Haskell programmer by
implementing a Haskell frontend (embedding) of the
ArBB system. We call this embedding EmbArBB. We use
standard Haskell embedded language procedures to
provide an interface to the ArBB functionality in
Haskell. EmbArBB is work in progress and does not
currently support all of the ArBB
functionality. Some small programming examples
illustrate how the Haskell embedding is used to
write programs. ArBB code is short and to the point
in both C++ and Haskell. Matrix multiplication has
been benchmarked in sequential C++, ArBB in C++,
EmbArBB and the Repa library. The C++ and the
Haskell embeddings have almost identical
performance, showing that the Haskell embedding does
not impose any large extra overheads. Two image
processing algorithms have also been benchmarked
against Repa. In these benchmarks at least, EmbArBB
performance is much better than that of the Repa
library, indicating that building on ArBB may be a
cheap and easy approach to exploiting data
parallelism in Haskell.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364477&ftid=1282816&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:18:35>},
}
@inproceedings{Singh:2010:DDP:1708046.1708048,
author = {Singh, Satnam},
title = {Declarative Data-parallel Programming with the
Accelerator System},
booktitle = {Proceedings of the 5th ACM SIGPLAN Workshop on
Declarative Aspects of Multicore Programming},
series = {DAMP '10},
year = 2010,
isbn = {978-1-60558-859-9},
location = {Madrid, Spain},
pages = {1--2},
numpages = 2,
url = {http://doi.acm.org/10.1145/1708046.1708048},
doi = {10.1145/1708046.1708048},
acmid = 1708048,
publisher = {ACM},
address = {New York, NY, USA},
  keywords = {data-parallelism},
abstract = {The Accelerator project at Microsoft Research is
developing a data-parallel library which provides a
high level and accessible mechanism for producing
code that executes on GPUs (via DirectX) and X64
multi-cores using SIMD instructions. An experimental
                  target can also produce VHDL netlists which can be
implemented on FPGA circuits. Although the library
is developed in a mainstream imperative language the
user programs in what is essentially a functional
embedded domain specific language. The library
provides data-parallel arrays and data-parallel
operations e.g. element-wise operations, reductions,
and matrix transformations. It is also possible to
layer higher level domain specific data-parallel
languages on top of Accelerator e.g. parallel
bitonic sorters and mergers (e.g. Batcher's) have
been expressed in a combinator based library in F#
which has appealing properties for composing
computations through the use of higher order
functions. A key distinction between the Accelerator
approach for generating GPU code and the CUDA path
supported by NVidia is that Accelerator works
on-line by jit-ing rather than off-line by
generating programs that need to be further compiled
                  and executed. This greatly simplifies the usage model
for the programmer. The circuit generator target for
                  Accelerator cannot work by jit-ing, so it works in
off-line mode. The ability to target three quite
different architectures (GPUs, multi-core SIMD
instructions and FPGAs) is possible due to the
careful design of the Accelerator library by picking
just the right level of abstraction for the data and
its associated data-parallel operations. A series of
examples have been developed including applications
for image processing and motion estimation.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=1708048&ftid=743279&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:20:55>},
}
@inproceedings{Claessen:2012:EAC:2103736.2103740,
author = {Claessen, Koen and Sheeran, Mary and Svensson, Bo
Joel},
title = {Expressive Array Constructs in an Embedded GPU
Kernel Programming Language},
booktitle = {Proceedings of the 7th Workshop on Declarative
Aspects and Applications of Multicore Programming},
series = {DAMP '12},
year = 2012,
isbn = {978-1-4503-1117-5},
location = {Philadelphia, Pennsylvania, USA},
pages = {21--30},
numpages = 10,
url = {http://doi.acm.org/10.1145/2103736.2103740},
doi = {10.1145/2103736.2103740},
acmid = 2103740,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {arrays, data parallelism, embedded domain specific
language, general purpose gpu programming, haskell},
abstract = {Graphics Processing Units (GPUs) are powerful
computing devices that with the advent of
                  CUDA/OpenCL are becoming useful for general purpose
computations. Obsidian is an embedded domain
specific language that generates CUDA kernels from
functional descriptions. A symbolic array
construction allows us to guarantee that
intermediate arrays are fused away. However, the
current array construction has some drawbacks; in
particular, arrays cannot be combined
efficiently. We add a new type of push arrays to the
existing Obsidian system in order to solve this
problem. The two array types complement each other,
and enable the definition of combinators that both
take apart and combine arrays, and that result in
efficient generated code. This extension to Obsidian
is demonstrated on a sequence of sorting kernels,
with good results. The case study also illustrates
the use of combinators for expressing the structure
of parallel algorithms. The work presented is
preliminary, and the combinators presented must be
generalised. However, the raw speed of the generated
kernels bodes well.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2103740&ftid=1093833&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:23:49>},
}
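% The pull/push distinction at the heart of the paper, in a hedged
% Haskell sketch; Obsidian's real types target CUDA code generation,
% and these simplified definitions only show the shape of the two
% representations:
%
%   -- A pull array is a length plus an index function: cheap to
%   -- index, awkward to concatenate without a conditional.
%   data Pull a = Pull Int (Int -> a)
%
%   -- A push array knows how to write itself via a callback; two
%   -- push arrays concatenate by merely offsetting the writer.
%   data Push m a = Push Int ((Int -> a -> m ()) -> m ())
%
%   append :: Monad m => Push m a -> Push m a -> Push m a
%   append (Push la f) (Push lb g) =
%     Push (la + lb) (\wr -> f wr >> g (\i x -> wr (la + i) x))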
@inproceedings{Madsen:2015:FAS:2808091.2808094,
author = {Madsen, Frederik M. and Clifton-Everest, Robert and
Chakravarty, Manuel M. T. and Keller, Gabriele},
title = {Functional Array Streams},
booktitle = {Proceedings of the 4th ACM SIGPLAN Workshop on
Functional High-Performance Computing},
series = {FHPC 2015},
year = 2015,
isbn = {978-1-4503-3807-3},
location = {Vancouver, BC, Canada},
pages = {23--34},
numpages = 12,
url = {http://doi.acm.org/10.1145/2808091.2808094},
doi = {10.1145/2808091.2808094},
acmid = 2808094,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Arrays, Data parallelism, Embedded language, GPGPU,
Haskell, Streams},
abstract = { Regular array languages for high performance
computing based on aggregate operations provide a
convenient parallel programming model, which enables
the generation of efficient code for SIMD
architectures, such as GPUs. However, the data sets
that can be processed with current implementations
are severely constrained by the limited amount of
main memory available in these architectures. In
this paper, we propose an extension of the embedded
array language Accelerate with a notion of
sequences, resulting in a two level hierarchy which
allows the programmer to specify a partitioning
strategy which facilitates automatic resource
allocation. Depending on the available memory, the
runtime system processes the overall data set in
streams of chunks appropriate to the hardware
parameters. In this paper, we present the language
design for the sequence operations, as well as the
compilation and runtime support, and demonstrate
with a set of benchmarks the feasibility of this
approach. },
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2808094&ftid=1614993&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:24:23>},
}
@inproceedings{Wernsing:2012:RHA:2380403.2380423,
author = {Wernsing, John Robert and Stitt, Greg and Fowers,
Jeremy},
title = {The RACECAR Heuristic for Automatic Function
Specialization on Multi-core Heterogeneous Systems},
booktitle = {Proceedings of the 2012 International Conference on
Compilers, Architectures and Synthesis for Embedded
Systems},
series = {CASES '12},
year = 2012,
isbn = {978-1-4503-1424-4},
location = {Tampere, Finland},
pages = {81--90},
numpages = 10,
url = {http://doi.acm.org/10.1145/2380403.2380423},
doi = {10.1145/2380403.2380423},
acmid = 2380423,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {elastic computing, execution time, fpga, gpu,
heterogeneous, optimization, performance prediction,
racecar},
abstract = {Embedded systems increasingly combine multi-core
processors and heterogeneous resources such as
graphics-processing units and field-programmable
gate arrays. However, significant application design
complexity for such systems caused by parallel
programming and device-specific challenges has often
led to untapped performance potential. Application
developers targeting such systems currently must
determine how to parallelize computation, create
different device-specialized implementations for
each heterogeneous resource, and then determine how
to apportion work to each resource. In this paper,
we present the RACECAR heuristic to automate the
optimization of applications for multi-core
heterogeneous systems by automatically exploring
implementation alternatives that include different
algorithms, parallelization strategies, and work
distributions. Experimental results show
RACECAR-specialized implementations can effectively
incorporate provided implementations and parallelize
computation across multiple cores,
graphics-processing units, and field-programmable
gate arrays, improving performance by an average of
47x compared to a CPU, while the fastest provided
implementations are only able to average 33x.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2380423&ftid=1294482&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: rejected <2016-01-12 15:25:29>},
}
@article{Keller:2012:VA:2430532.2364512,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Lippmeier, Ben and Peyton
Jones, Simon},
title = {Vectorisation Avoidance},
journal = {SIGPLAN Not.},
issue_date = {December 2012},
volume = 47,
number = 12,
  month = dec,
year = 2012,
issn = {0362-1340},
pages = {37--48},
numpages = 12,
url = {http://doi.acm.org/10.1145/2430532.2364512},
doi = {10.1145/2430532.2364512},
acmid = 2364512,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {haskell, nested data parallelism, program
transformation},
abstract = {Flattening nested parallelism is a vectorising code
transform that converts irregular nested parallelism
into flat data parallelism. Although the result has
good asymptotic performance, flattening thoroughly
restructures the code. Many intermediate data
structures and traversals are introduced, which may
or may not be eliminated by subsequent
optimisation. We present a novel program analysis to
identify parts of the program where flattening would
only introduce overhead, without appropriate
gain. We present empirical evidence that avoiding
vectorisation in these cases leads to more efficient
programs than if we had applied vectorisation and
then relied on array fusion to eliminate
intermediates from the resulting code.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364512&ftid=1282873&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:26:38>},
}
@inproceedings{Keller:2012:VA:2364506.2364512,
author = {Keller, Gabriele and Chakravarty, Manuel M.T. and
Leshchinskiy, Roman and Lippmeier, Ben and Peyton
Jones, Simon},
title = {Vectorisation Avoidance},
booktitle = {Proceedings of the 2012 Haskell Symposium},
series = {Haskell '12},
year = 2012,
isbn = {978-1-4503-1574-6},
location = {Copenhagen, Denmark},
pages = {37--48},
numpages = 12,
url = {http://doi.acm.org/10.1145/2364506.2364512},
doi = {10.1145/2364506.2364512},
acmid = 2364512,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {haskell, nested data parallelism, program
transformation},
abstract = {Flattening nested parallelism is a vectorising code
transform that converts irregular nested parallelism
into flat data parallelism. Although the result has
good asymptotic performance, flattening thoroughly
restructures the code. Many intermediate data
structures and traversals are introduced, which may
or may not be eliminated by subsequent
optimisation. We present a novel program analysis to
identify parts of the program where flattening would
only introduce overhead, without appropriate
gain. We present empirical evidence that avoiding
vectorisation in these cases leads to more efficient
programs than if we had applied vectorisation and
then relied on array fusion to eliminate
intermediates from the resulting code.},
fullTextUrl =
{http://dl.acm.org/ft_gateway.cfm?id=2364512&ftid=1282873&dwn=1&CFID=574762219&CFTOKEN=10899110},
review = {fbie: accepted <2016-01-12 15:26:42>},
}
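% A sketch of the kind of code the analysis targets (illustrative
% Haskell, not the paper's DPH notation): the lambda body below is
% purely scalar, so flattening it into array-at-a-time operations
% (one intermediate for x * x, another for the + 1) would add
% traversals without exposing any new parallelism. The analysis
% leaves such bodies as scalar code inside a single parallel map.
%
%   squarePlusOne :: [Double] -> [Double]
%   squarePlusOne = map (\x -> x * x + 1)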
@inproceedings{Shafarenko:2002:CHT:571157.571160,
author = {Shafarenko, Alex},
title = {Coercion As Homomorphism: Type Inference in a System
with Subtyping and Overloading},
booktitle = {Proceedings of the 4th ACM SIGPLAN International
Conference on Principles and Practice of Declarative
Programming},
series = {PPDP '02},
year = 2002,
isbn = {1-58113-528-9},
location = {Pittsburgh, PA, USA},
pages = {14--25},
numpages = 12,
url = {http://doi.acm.org/10.1145/571157.571160},
doi = {10.1145/571157.571160},
acmid = 571160,
publisher = {ACM},
address = {New York, NY, USA},
keywords = {array processing, data-parallel programming,
overloading, subtyping, type inference},
fullTextUrl =