From 09d8a2f2153b68c242429d8f3be9c2f41b8207fb Mon Sep 17 00:00:00 2001 From: Kaarthik Sivashanmugam Date: Thu, 25 Aug 2016 09:04:19 -0700 Subject: [PATCH 01/15] updating examples to use Mobius 2.0 preview release and minor updates --- README.md | 2 +- build/Build.cmd | 2 +- csharp/SparkCLR.nuspec | 2 +- dev/scripts/SetSparkClrPackageVersion.ps1 | 8 ++++---- examples/Batch/WordCount/WordCount.csproj | 6 +++--- examples/Batch/WordCount/packages.config | 2 +- examples/Batch/pi/Pi.csproj | 6 +++--- examples/Batch/pi/packages.config | 2 +- examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj | 4 ++-- examples/Sql/CassandraDataFrame/packages.config | 2 +- examples/Sql/HiveDataFrame/HiveDataFrame.csproj | 4 ++-- examples/Sql/HiveDataFrame/packages.config | 2 +- examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj | 6 +++--- examples/Sql/JdbcDataFrame/packages.config | 2 +- examples/Sql/SparkXml/SparkXml.csproj | 6 +++--- examples/Sql/SparkXml/packages.config | 2 +- examples/Streaming/EventHub/EventHub.csproj | 6 +++--- examples/Streaming/EventHub/packages.config | 2 +- examples/Streaming/HdfsWordCount/HdfsWordCount.csproj | 6 +++--- examples/Streaming/HdfsWordCount/packages.config | 2 +- examples/Streaming/Kafka/Kafka.csproj | 4 ++-- examples/Streaming/Kafka/packages.config | 2 +- examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj | 6 +++--- examples/fsharp/JsonDataFrame/packages.config | 2 +- examples/fsharp/WordCount/WordCountFSharp.fsproj | 6 +++--- examples/fsharp/WordCount/packages.config | 2 +- 26 files changed, 48 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 2dc6d9ab..a1ce8255 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ Refer to the [docs folder](docs) for design overview and other info on Mobius ## Supported Spark Versions -Mobius is built and tested with Apache Spark [1.4.1](https://github.com/Microsoft/Mobius/tree/branch-1.4), [1.5.2](https://github.com/Microsoft/Mobius/tree/branch-1.5) and [1.6.*](https://github.com/Microsoft/Mobius/tree/branch-1.6). +Mobius is built and tested with Apache Spark [1.4.1](https://github.com/Microsoft/Mobius/tree/branch-1.4), [1.5.2](https://github.com/Microsoft/Mobius/tree/branch-1.5), [1.6.*](https://github.com/Microsoft/Mobius/tree/branch-1.6) and [2.0](https://github.com/Microsoft/Mobius/tree/branch-2.0). 
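The SetSparkClrPackageVersion.ps1 hunk further down in this patch stamps the release version into every *.csproj and *.fsproj by rewriting the Microsoft.SparkCLR segment of each package hint path. A minimal runnable C# sketch of that same replacement; the regex and replacement string come from the script, while the sample path and version value are only illustrative:

using System;
using System.Text.RegularExpressions;

class SetPackageVersionDemo
{
    static void Main()
    {
        // Any Microsoft.SparkCLR.* hint path is rewritten the same way; this input is a sample.
        const string hintPath = @"..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe";
        const string version = "2.0.0-PREVIEW-1";

        // Same pattern the PowerShell script applies to each project file it finds.
        string updated = Regex.Replace(hintPath, @"\\Microsoft\.SparkCLR.*\\lib",
            @"\Microsoft.SparkCLR." + version + @"\lib");

        // Prints: ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe
        Console.WriteLine(updated);
    }
}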
## Releases diff --git a/build/Build.cmd b/build/Build.cmd index 7a83f5bd..96166fd0 100644 --- a/build/Build.cmd +++ b/build/Build.cmd @@ -225,7 +225,7 @@ if not defined ProjectVersion ( goto :distdone ) -set SPARKCLR_NAME=spark-clr_2.10-%ProjectVersion% +set SPARKCLR_NAME=spark-clr_2.11-%ProjectVersion% @echo "%SPARKCLR_HOME% @rem copy samples to top-level folder before zipping diff --git a/csharp/SparkCLR.nuspec b/csharp/SparkCLR.nuspec index 9c027b5d..d725b2f5 100644 --- a/csharp/SparkCLR.nuspec +++ b/csharp/SparkCLR.nuspec @@ -2,7 +2,7 @@ Microsoft.SparkCLR - 1.6.200-SNAPSHOT + 2.0.000-SNAPSHOT Microsoft Corporation Microsoft Corporation https://github.com/Microsoft/Mobius/blob/master/LICENSE diff --git a/dev/scripts/SetSparkClrPackageVersion.ps1 b/dev/scripts/SetSparkClrPackageVersion.ps1 index b9693b24..3de5b4b4 100644 --- a/dev/scripts/SetSparkClrPackageVersion.ps1 +++ b/dev/scripts/SetSparkClrPackageVersion.ps1 @@ -17,15 +17,15 @@ function Update-Csproj($targetDir, $version) Write-Output "[SetSparkClrPackageVersion.Update-Csproj] Start setting *.csproj under $targetDir to version=$version" # - # Update Mobius package version to this release. Example in *.csproj: + # Update Mobius package version to this release. Example in *.csproj and *.fsproj: # ..\packages\Microsoft.SparkCLR.1.5.2-SNAPSHOT\lib\net45\CSharpWorker.exe # - Get-ChildItem $targetDir -filter "*.csproj" -recurs | % { - Write-Output "[SetSparkClrPackageVersion.Update-Csproj] updating $($_.FullName)" + Get-ChildItem $targetDir -filter "*.*sproj" -recurs | % { + Write-Output "[SetSparkClrPackageVersion.Update-*sproj] updating $($_.FullName)" ((Get-Content $_.FullName) -replace "\\Microsoft\.SparkCLR.*\\lib", "\Microsoft.SparkCLR.$version\lib") | Set-Content -Encoding UTF8 -Path $_.FullName -force } - Write-Output "[SetSparkClrPackageVersion.Update-Csproj] Done setting *.csproj under $targetDir to version=$version" + Write-Output "[SetSparkClrPackageVersion.Update-Csproj] Done setting *.csproj and *.fsproj under $targetDir to version=$version" } function Update-PackageConfig($targetDir, $version) diff --git a/examples/Batch/WordCount/WordCount.csproj b/examples/Batch/WordCount/WordCount.csproj index a9f0dfc8..40b8f52a 100644 --- a/examples/Batch/WordCount/WordCount.csproj +++ b/examples/Batch/WordCount/WordCount.csproj @@ -34,7 +34,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False @@ -42,7 +42,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False @@ -67,7 +67,7 @@ - + CSharpWorker.exe.config diff --git a/examples/Batch/WordCount/packages.config b/examples/Batch/WordCount/packages.config index fb0cfe9e..293105d3 100644 --- a/examples/Batch/WordCount/packages.config +++ b/examples/Batch/WordCount/packages.config @@ -4,5 +4,5 @@ - + diff --git a/examples/Batch/pi/Pi.csproj b/examples/Batch/pi/Pi.csproj index 058fa0ca..751852f8 100644 --- a/examples/Batch/pi/Pi.csproj +++ b/examples/Batch/pi/Pi.csproj @@ -37,7 +37,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False @@ -45,7 +45,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + 
..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False @@ -66,7 +66,7 @@ - + CSharpWorker.exe.config diff --git a/examples/Batch/pi/packages.config b/examples/Batch/pi/packages.config index 88903cd0..eaa63869 100644 --- a/examples/Batch/pi/packages.config +++ b/examples/Batch/pi/packages.config @@ -1,7 +1,7 @@  - + diff --git a/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj b/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj index 17849686..228764bd 100644 --- a/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj +++ b/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj @@ -35,13 +35,13 @@ - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False diff --git a/examples/Sql/CassandraDataFrame/packages.config b/examples/Sql/CassandraDataFrame/packages.config index fb0cfe9e..293105d3 100644 --- a/examples/Sql/CassandraDataFrame/packages.config +++ b/examples/Sql/CassandraDataFrame/packages.config @@ -4,5 +4,5 @@ - + diff --git a/examples/Sql/HiveDataFrame/HiveDataFrame.csproj b/examples/Sql/HiveDataFrame/HiveDataFrame.csproj index 00d1ff2d..6c81a1d4 100644 --- a/examples/Sql/HiveDataFrame/HiveDataFrame.csproj +++ b/examples/Sql/HiveDataFrame/HiveDataFrame.csproj @@ -38,11 +38,11 @@ ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe True - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll True diff --git a/examples/Sql/HiveDataFrame/packages.config b/examples/Sql/HiveDataFrame/packages.config index e0de1c95..218d018f 100644 --- a/examples/Sql/HiveDataFrame/packages.config +++ b/examples/Sql/HiveDataFrame/packages.config @@ -1,7 +1,7 @@  - + diff --git a/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj b/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj index 80c1565b..f19954c4 100644 --- a/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj +++ b/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj @@ -36,7 +36,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False @@ -44,7 +44,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False @@ -65,7 +65,7 @@ - + CSharpWorker.exe.config diff --git a/examples/Sql/JdbcDataFrame/packages.config b/examples/Sql/JdbcDataFrame/packages.config index fb0cfe9e..293105d3 100644 --- a/examples/Sql/JdbcDataFrame/packages.config +++ b/examples/Sql/JdbcDataFrame/packages.config @@ -4,5 +4,5 @@ - + diff --git a/examples/Sql/SparkXml/SparkXml.csproj b/examples/Sql/SparkXml/SparkXml.csproj index 22675f61..381ec5db 100644 --- a/examples/Sql/SparkXml/SparkXml.csproj +++ b/examples/Sql/SparkXml/SparkXml.csproj @@ -36,7 +36,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + 
..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False @@ -44,7 +44,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False @@ -65,7 +65,7 @@ - + CSharpWorker.exe.config diff --git a/examples/Sql/SparkXml/packages.config b/examples/Sql/SparkXml/packages.config index fb0cfe9e..293105d3 100644 --- a/examples/Sql/SparkXml/packages.config +++ b/examples/Sql/SparkXml/packages.config @@ -4,5 +4,5 @@ - + diff --git a/examples/Streaming/EventHub/EventHub.csproj b/examples/Streaming/EventHub/EventHub.csproj index 057c6f88..631b2c3d 100644 --- a/examples/Streaming/EventHub/EventHub.csproj +++ b/examples/Streaming/EventHub/EventHub.csproj @@ -35,7 +35,7 @@ - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False @@ -43,7 +43,7 @@ - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False @@ -68,7 +68,7 @@ - + CSharpWorker.exe.config diff --git a/examples/Streaming/EventHub/packages.config b/examples/Streaming/EventHub/packages.config index edebda46..abe733c5 100644 --- a/examples/Streaming/EventHub/packages.config +++ b/examples/Streaming/EventHub/packages.config @@ -4,7 +4,7 @@ - + diff --git a/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj b/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj index b38df4b6..277fa405 100644 --- a/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj +++ b/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj @@ -38,7 +38,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False @@ -46,7 +46,7 @@ False - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False @@ -64,7 +64,7 @@ - + PreserveNewest diff --git a/examples/Streaming/HdfsWordCount/packages.config b/examples/Streaming/HdfsWordCount/packages.config index 88903cd0..eaa63869 100644 --- a/examples/Streaming/HdfsWordCount/packages.config +++ b/examples/Streaming/HdfsWordCount/packages.config @@ -1,7 +1,7 @@  - + diff --git a/examples/Streaming/Kafka/Kafka.csproj b/examples/Streaming/Kafka/Kafka.csproj index e1608f57..c221fa81 100644 --- a/examples/Streaming/Kafka/Kafka.csproj +++ b/examples/Streaming/Kafka/Kafka.csproj @@ -33,14 +33,14 @@ - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe False ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll False diff --git a/examples/Streaming/Kafka/packages.config b/examples/Streaming/Kafka/packages.config index 88903cd0..eaa63869 100644 --- a/examples/Streaming/Kafka/packages.config +++ b/examples/Streaming/Kafka/packages.config @@ -1,7 +1,7 @@  - + diff --git a/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj b/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj index 5ab9eaa3..b49e9de0 100644 --- 
a/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj +++ b/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj @@ -66,13 +66,13 @@ - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll @@ -98,4 +98,4 @@ --> - \ No newline at end of file + diff --git a/examples/fsharp/JsonDataFrame/packages.config b/examples/fsharp/JsonDataFrame/packages.config index 1b77c005..941d504a 100644 --- a/examples/fsharp/JsonDataFrame/packages.config +++ b/examples/fsharp/JsonDataFrame/packages.config @@ -2,7 +2,7 @@ - + diff --git a/examples/fsharp/WordCount/WordCountFSharp.fsproj b/examples/fsharp/WordCount/WordCountFSharp.fsproj index 4700233f..3b40aad9 100644 --- a/examples/fsharp/WordCount/WordCountFSharp.fsproj +++ b/examples/fsharp/WordCount/WordCountFSharp.fsproj @@ -71,7 +71,7 @@ - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\CSharpWorker.exe + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe True @@ -83,7 +83,7 @@ True - ..\..\packages\Microsoft.SparkCLR.1.6.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll True @@ -110,4 +110,4 @@ --> - \ No newline at end of file + diff --git a/examples/fsharp/WordCount/packages.config b/examples/fsharp/WordCount/packages.config index 1b77c005..941d504a 100644 --- a/examples/fsharp/WordCount/packages.config +++ b/examples/fsharp/WordCount/packages.config @@ -2,7 +2,7 @@ - + From 9f4276e3386ba372f849fba2be8918f68e2972f4 Mon Sep 17 00:00:00 2001 From: Yun Tang Date: Thu, 1 Sep 2016 02:33:41 +0800 Subject: [PATCH 02/15] [Mobius-102] Add missing license text in batch and powershell scripts (#557) --- build/Build.cmd | 5 +++++ build/build.sh | 5 +++++ build/copyjar.ps1 | 5 +++++ build/localmode/RunSamples.cmd | 6 ++++++ build/localmode/downloadtools.ps1 | 5 +++++ build/localmode/dumpsoftware.ps1 | 5 +++++ build/localmode/nugetpack.ps1 | 5 +++++ build/localmode/patchpom.ps1 | 5 +++++ build/localmode/precheck.cmd | 5 +++++ build/localmode/run-samples.sh | 5 +++++ build/localmode/zipdir.ps1 | 4 +++- cpp/Build.cmd | 5 +++++ cpp/Clean.cmd | 6 ++++++ csharp/Build.cmd | 5 +++++ csharp/Clean.cmd | 6 ++++++ csharp/Test.cmd | 5 +++++ csharp/build.sh | 5 +++++ csharp/clean.sh | 5 +++++ csharp/test.sh | 5 +++++ dev/scripts/SetSparkClrJarVersion.ps1 | 5 +++++ dev/scripts/SetSparkClrNugetPackageVersion.ps1 | 5 +++++ dev/scripts/SetSparkClrPackageVersion.ps1 | 5 +++++ dev/scripts/SetVersion.cmd | 6 ++++++ examples/Build.cmd | 5 +++++ examples/Clean.cmd | 6 ++++++ examples/build.sh | 5 +++++ examples/clean.sh | 5 +++++ scripts/sparkclr-submit.cmd | 6 ++++++ scripts/sparkclr-submit.sh | 5 +++++ 29 files changed, 149 insertions(+), 1 deletion(-) diff --git a/build/Build.cmd b/build/Build.cmd index 96166fd0..05239aca 100644 --- a/build/Build.cmd +++ b/build/Build.cmd @@ -1,6 +1,11 @@ @setlocal @echo OFF +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+rem + if "%1" == "csharp" set buildCSharp=true SET CMDHOME=%~dp0 diff --git a/build/build.sh b/build/build.sh index 32f17577..5e935cd5 100755 --- a/build/build.sh +++ b/build/build.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export SPARKCLR_HOME="$FWDIR/runtime" diff --git a/build/copyjar.ps1 b/build/copyjar.ps1 index ea7184a4..c67830ef 100755 --- a/build/copyjar.ps1 +++ b/build/copyjar.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + function Get-ScriptDirectory { $Invocation = (Get-Variable MyInvocation -Scope 1).Value; diff --git a/build/localmode/RunSamples.cmd b/build/localmode/RunSamples.cmd index dec79cf9..6ad9094c 100644 --- a/build/localmode/RunSamples.cmd +++ b/build/localmode/RunSamples.cmd @@ -1,4 +1,10 @@ @echo OFF + +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + setlocal enabledelayedexpansion SET CMDHOME=%~dp0 diff --git a/build/localmode/downloadtools.ps1 b/build/localmode/downloadtools.ps1 index f363e21a..b71f355d 100644 --- a/build/localmode/downloadtools.ps1 +++ b/build/localmode/downloadtools.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + # # Input - # (1) "stage" parameter, accepts either "build" or "run" diff --git a/build/localmode/dumpsoftware.ps1 b/build/localmode/dumpsoftware.ps1 index 5e5462e3..9943bf57 100644 --- a/build/localmode/dumpsoftware.ps1 +++ b/build/localmode/dumpsoftware.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + $x64items = @(Get-ChildItem "HKLM:SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall") $x64items + @(Get-ChildItem "HKLM:SOFTWARE\wow6432node\Microsoft\Windows\CurrentVersion\Uninstall") ` | ForEach-object { Get-ItemProperty Microsoft.PowerShell.Core\Registry::$_ } ` diff --git a/build/localmode/nugetpack.ps1 b/build/localmode/nugetpack.ps1 index 9724a305..fc603d02 100644 --- a/build/localmode/nugetpack.ps1 +++ b/build/localmode/nugetpack.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + $root = (split-path -parent $MyInvocation.MyCommand.Definition) + '\..\..' # expected tagname: v{version-string}. E.g., "v1.5.2-snapshot-2", "v1.5.2-prerelease-1" diff --git a/build/localmode/patchpom.ps1 b/build/localmode/patchpom.ps1 index 9608635f..cd74daf8 100644 --- a/build/localmode/patchpom.ps1 +++ b/build/localmode/patchpom.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + # # Input - # "targetPom" parameter, target Pom.xml file diff --git a/build/localmode/precheck.cmd b/build/localmode/precheck.cmd index dc0217ed..45511aa7 100644 --- a/build/localmode/precheck.cmd +++ b/build/localmode/precheck.cmd @@ -1,5 +1,10 @@ @echo OFF +rem +rem Copyright (c) Microsoft. 
All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + set precheck=ok if not exist "%JAVA_HOME%\bin\java.exe" ( diff --git a/build/localmode/run-samples.sh b/build/localmode/run-samples.sh index 48165bf5..5c6486bb 100755 --- a/build/localmode/run-samples.sh +++ b/build/localmode/run-samples.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + export verbose= for param in "$@" diff --git a/build/localmode/zipdir.ps1 b/build/localmode/zipdir.ps1 index 4620c928..1073a1ba 100644 --- a/build/localmode/zipdir.ps1 +++ b/build/localmode/zipdir.ps1 @@ -1,8 +1,10 @@ +# # Copyright (c) Microsoft. All rights reserved. # Licensed under the MIT license. See LICENSE file in the project root for full license information. # + # This script takes in "dir" and "target" parameters, zips all files under dir to the target file -# + Param([string]$dir, [string]$target) diff --git a/cpp/Build.cmd b/cpp/Build.cmd index 1feef374..42d2e639 100644 --- a/cpp/Build.cmd +++ b/cpp/Build.cmd @@ -1,6 +1,11 @@ @setlocal @ECHO off +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + SET CMDHOME=%~dp0 @REM Remove trailing backslash \ set CMDHOME=%CMDHOME:~0,-1% diff --git a/cpp/Clean.cmd b/cpp/Clean.cmd index 2a978baa..bf6d5b03 100644 --- a/cpp/Clean.cmd +++ b/cpp/Clean.cmd @@ -1,4 +1,10 @@ @ECHO OFF + +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + FOR /D /R . %%G IN (bin) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") FOR /D /R . %%G IN (obj) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") FOR /D /R . %%G IN (x64) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") \ No newline at end of file diff --git a/csharp/Build.cmd b/csharp/Build.cmd index 6c2b36d3..a9499d3b 100644 --- a/csharp/Build.cmd +++ b/csharp/Build.cmd @@ -1,6 +1,11 @@ @setlocal @ECHO off +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + SET CMDHOME=%~dp0 @REM Remove trailing backslash \ set CMDHOME=%CMDHOME:~0,-1% diff --git a/csharp/Clean.cmd b/csharp/Clean.cmd index e8454b6b..29a6e3cb 100644 --- a/csharp/Clean.cmd +++ b/csharp/Clean.cmd @@ -1,3 +1,9 @@ @ECHO OFF + +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + FOR /D /R . %%G IN (bin) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") FOR /D /R . %%G IN (obj) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") \ No newline at end of file diff --git a/csharp/Test.cmd b/csharp/Test.cmd index cd1eafab..aaae4559 100644 --- a/csharp/Test.cmd +++ b/csharp/Test.cmd @@ -1,6 +1,11 @@ @setlocal @ECHO off +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+rem + SET CMDHOME=%~dp0 @REM Remove trailing backslash \ set CMDHOME=%CMDHOME:~0,-1% diff --git a/csharp/build.sh b/csharp/build.sh index 01364378..769f3226 100755 --- a/csharp/build.sh +++ b/csharp/build.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export CppDll=NoCpp export XBUILDOPT=/verbosity:minimal diff --git a/csharp/clean.sh b/csharp/clean.sh index 33edea39..255e4e6b 100755 --- a/csharp/clean.sh +++ b/csharp/clean.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + for g in `find . -type d -name bin` do rm -r -f "$g" diff --git a/csharp/test.sh b/csharp/test.sh index 614898d7..7aabbad7 100755 --- a/csharp/test.sh +++ b/csharp/test.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" if [ "$NUNITCONSOLE" = "" ]; diff --git a/dev/scripts/SetSparkClrJarVersion.ps1 b/dev/scripts/SetSparkClrJarVersion.ps1 index 13536a7c..f8d06ead 100644 --- a/dev/scripts/SetSparkClrJarVersion.ps1 +++ b/dev/scripts/SetSparkClrJarVersion.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + # # This script takes in "version" and "targetDir" (optional) parameters, update Spark-Clr jar # version reference in all scripts under "targetDir". diff --git a/dev/scripts/SetSparkClrNugetPackageVersion.ps1 b/dev/scripts/SetSparkClrNugetPackageVersion.ps1 index 90a4b727..25cfaddf 100644 --- a/dev/scripts/SetSparkClrNugetPackageVersion.ps1 +++ b/dev/scripts/SetSparkClrNugetPackageVersion.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + # # This script takes in and "nuspecDir" and "version" parameters, update Mobius Nuget package # version diff --git a/dev/scripts/SetSparkClrPackageVersion.ps1 b/dev/scripts/SetSparkClrPackageVersion.ps1 index 3de5b4b4..4e53b09f 100644 --- a/dev/scripts/SetSparkClrPackageVersion.ps1 +++ b/dev/scripts/SetSparkClrPackageVersion.ps1 @@ -1,3 +1,8 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + # # This script takes in "version" and "targetDir" (optional) parameters, update Mobius Nuget package # version reference in all *.csproj and packages.config under "dir". diff --git a/dev/scripts/SetVersion.cmd b/dev/scripts/SetVersion.cmd index 144674a7..52efa44f 100644 --- a/dev/scripts/SetVersion.cmd +++ b/dev/scripts/SetVersion.cmd @@ -1,4 +1,10 @@ @echo OFF + +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + setlocal enabledelayedexpansion IF "%1"=="" (goto :usage) diff --git a/examples/Build.cmd b/examples/Build.cmd index c641b0bc..f9b0bcb6 100644 --- a/examples/Build.cmd +++ b/examples/Build.cmd @@ -1,6 +1,11 @@ @setlocal @ECHO off +rem +rem Copyright (c) Microsoft. 
All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + SET CMDHOME=%~dp0 @REM Remove trailing backslash \ set CMDHOME=%CMDHOME:~0,-1% diff --git a/examples/Clean.cmd b/examples/Clean.cmd index e8454b6b..29a6e3cb 100644 --- a/examples/Clean.cmd +++ b/examples/Clean.cmd @@ -1,3 +1,9 @@ @ECHO OFF + +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + FOR /D /R . %%G IN (bin) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") FOR /D /R . %%G IN (obj) DO @IF EXIST "%%G" (@echo RDMR /S /Q "%%G" & rd /s /q "%%G") \ No newline at end of file diff --git a/examples/build.sh b/examples/build.sh index eed3f7fb..a86e96ae 100755 --- a/examples/build.sh +++ b/examples/build.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export XBUILDOPT=/verbosity:minimal diff --git a/examples/clean.sh b/examples/clean.sh index 33edea39..255e4e6b 100755 --- a/examples/clean.sh +++ b/examples/clean.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. +# + for g in `find . -type d -name bin` do rm -r -f "$g" diff --git a/scripts/sparkclr-submit.cmd b/scripts/sparkclr-submit.cmd index b880a1bf..68fadd80 100644 --- a/scripts/sparkclr-submit.cmd +++ b/scripts/sparkclr-submit.cmd @@ -1,4 +1,10 @@ @echo off + +rem +rem Copyright (c) Microsoft. All rights reserved. +rem Licensed under the MIT license. See LICENSE file in the project root for full license information. +rem + setlocal enabledelayedexpansion set CMDHOME=%~dp0 diff --git a/scripts/sparkclr-submit.sh b/scripts/sparkclr-submit.sh index 72383b5e..9482ec34 100755 --- a/scripts/sparkclr-submit.sh +++ b/scripts/sparkclr-submit.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+# + function spark_home_error() { echo "[sparkclr-submit.sh] Error - SPARK_HOME environment variable is not export" echo "[sparkclr-submit.sh] Note that SPARK_HOME environment variable should not have trailing /" From 7a6d6bfa22f616e64bc7bd6f3e095f43373254b9 Mon Sep 17 00:00:00 2001 From: Hebin Huang Date: Fri, 2 Sep 2016 14:06:40 -0700 Subject: [PATCH 03/15] Improve Worker logging to make CSharpWorkerFunc stacktrace clear (#559) --- .../Core/CSharpWorkerFunc.cs | 7 +- .../Services/DefaultLoggerService.cs | 25 +++--- .../Services/ILoggerService.cs | 5 ++ .../Services/Log4NetLoggerService.cs | 14 ++-- .../Microsoft.Spark.CSharp.Adapter.Doc.XML | 16 ++++ csharp/AdapterTest/ByteBufTest.cs | 3 + .../Worker/Microsoft.Spark.CSharp/Worker.cs | 84 ++++++++++--------- 7 files changed, 97 insertions(+), 57 deletions(-) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/CSharpWorkerFunc.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/CSharpWorkerFunc.cs index 7101c19c..69bbf6a6 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/CSharpWorkerFunc.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/CSharpWorkerFunc.cs @@ -22,13 +22,14 @@ internal class CSharpWorkerFunc public CSharpWorkerFunc(Func, IEnumerable> func) { this.func = func; - stackTrace = new StackTrace(true).ToString(); + stackTrace = new StackTrace(true).ToString().Replace(" at ", " [STACK] "); } public CSharpWorkerFunc(Func, IEnumerable> func, string innerStackTrace) + : this(func) { - this.func = func; - stackTrace = new StackTrace(true).ToString() + "\nInner stack trace ...\n" + innerStackTrace; + stackTrace += string.Format(" [STACK] --- Inner stack trace: ---{0}{1}", + Environment.NewLine, innerStackTrace.Replace(" at ", " [STACK] ")); } public Func, IEnumerable> Func diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Services/DefaultLoggerService.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Services/DefaultLoggerService.cs index 8328d517..9588e63a 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Services/DefaultLoggerService.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Services/DefaultLoggerService.cs @@ -1,8 +1,4 @@ using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace Microsoft.Spark.CSharp.Services { @@ -12,7 +8,20 @@ namespace Microsoft.Spark.CSharp.Services /// public class DefaultLoggerService : ILoggerService { - internal readonly static DefaultLoggerService Instance = new DefaultLoggerService(typeof (Type)); + internal static readonly DefaultLoggerService Instance = new DefaultLoggerService(typeof(Type)); + private readonly Type type; + + private DefaultLoggerService(Type t) + { + type = t; + } + + /// + /// Gets a value indicating whether logging is enabled for the Debug level. + /// Always return true for the DefaultLoggerService object. + /// + public bool IsDebugEnabled { get { return true; } } + /// /// Get an instance of ILoggerService by a given type of logger /// @@ -22,12 +31,6 @@ public ILoggerService GetLoggerInstance(Type type) { return new DefaultLoggerService(type); } - - private readonly Type type; - private DefaultLoggerService(Type t) - { - type = t; - } /// /// Logs a message at debug level. 
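// A standalone illustration (not part of this patch) of the trace formatting the
// CSharpWorkerFunc change above introduces: stack frames are re-labelled with " [STACK] "
// and any inner trace is appended under an "--- Inner stack trace: ---" marker, which makes
// the multi-line trace easy to pick out of worker logs. The sample inner frame is illustrative.
using System;
using System.Diagnostics;

class StackTraceFormatDemo
{
    static void Main()
    {
        string outer = new StackTrace(true).ToString().Replace(" at ", " [STACK] ");
        string inner = " at SampleJob.Transform()";   // stands in for a serialized inner trace
        string combined = outer + string.Format(" [STACK] --- Inner stack trace: ---{0}{1}",
            Environment.NewLine, inner.Replace(" at ", " [STACK] "));
        Console.WriteLine(combined);
    }
}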
diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Services/ILoggerService.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Services/ILoggerService.cs index 5560df3e..714a31ac 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Services/ILoggerService.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Services/ILoggerService.cs @@ -7,6 +7,11 @@ namespace Microsoft.Spark.CSharp.Services /// public interface ILoggerService { + /// + /// Gets a value indicating whether logging is enabled for the Debug level. + /// + bool IsDebugEnabled { get; } + /// /// Get an instance of ILoggerService by a given type of logger /// diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Services/Log4NetLoggerService.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Services/Log4NetLoggerService.cs index 1b6ac76d..716c319d 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Services/Log4NetLoggerService.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Services/Log4NetLoggerService.cs @@ -1,10 +1,6 @@ using System; -using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Text; using System.Diagnostics; -using System.Threading.Tasks; using log4net; using log4net.Config; @@ -35,7 +31,15 @@ static Log4NetLoggerService() public Log4NetLoggerService(Type type) { logger = LogManager.GetLogger(type); - log4net.GlobalContext.Properties["pid"] = Process.GetCurrentProcess().Id; + GlobalContext.Properties["pid"] = Process.GetCurrentProcess().Id; + } + + /// + /// Gets a value indicating whether logging is enabled for the Debug level. + /// + public bool IsDebugEnabled + { + get { return logger.IsDebugEnabled; } } /// diff --git a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML index f7220aa2..5a445d12 100644 --- a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML +++ b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML @@ -4282,6 +4282,12 @@ Right now it just prints out the messages to Console + + + Gets a value indicating whether logging is enabled for the Debug level. + Always return true for the DefaultLoggerService object. + + Get an instance of ILoggerService by a given type of logger @@ -4365,6 +4371,11 @@ Defines a logger what be used in service + + + Gets a value indicating whether logging is enabled for the Debug level. + + Get an instance of ILoggerService by a given type of logger @@ -4459,6 +4470,11 @@ The type of the logger + + + Gets a value indicating whether logging is enabled for the Debug level. + + Logs a message at debug level. diff --git a/csharp/AdapterTest/ByteBufTest.cs b/csharp/AdapterTest/ByteBufTest.cs index 28a7453e..88d8543c 100644 --- a/csharp/AdapterTest/ByteBufTest.cs +++ b/csharp/AdapterTest/ByteBufTest.cs @@ -91,6 +91,9 @@ public void TestWriteReadUnsafeBuf() [Test] public void TestInvalidByteBuf() { + // Test ByteBuf with error status. + var errorByteBuf = ByteBuf.NewErrorStatusByteBuf(10054); + Assert.AreEqual(10054, errorByteBuf.Status); // Test invalid parameter to new ByteBuf. 
Assert.Throws(() => new ByteBuf(null, -1, 1024)); Assert.Throws(() => new ByteBuf(null, 0, -1)); diff --git a/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs b/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs index 3b2d3c0d..9c95d112 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs @@ -12,6 +12,7 @@ using System.Reflection; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; +using System.Text; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Network; @@ -31,10 +32,8 @@ namespace Microsoft.Spark.CSharp public class Worker { private static readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); - - private static ILoggerService logger = null; - - private static SparkCLRAssemblyHandler assemblyHandler = null; + private static ILoggerService logger; + private static SparkCLRAssemblyHandler assemblyHandler; public static void Main(string[] args) { @@ -49,7 +48,7 @@ public static void Main(string[] args) if (args.Length != 2) { - Console.Error.WriteLine("Wrong number of args: {0}, will exit", args.Count()); + Console.Error.WriteLine("Wrong number of args: {0}, will exit", args.Length); Environment.Exit(-1); } @@ -126,7 +125,7 @@ private static ISocketWrapper InitializeSocket(int javaPort) public static bool ProcessStream(Stream inputStream, Stream outputStream, int splitIndex) { - logger.LogInfo(string.Format("Start of stream processing, splitIndex: {0}", splitIndex)); + logger.LogInfo("Start of stream processing, splitIndex: {0}", splitIndex); bool readComplete = true; // Whether all input data from the socket is read though completely try @@ -170,7 +169,7 @@ public static bool ProcessStream(Stream inputStream, Stream outputStream, int sp else { // This may happen when the input data is not read completely, e.g., when take() operation is performed - logger.LogWarn(string.Format("**** unexpected read: {0}, not all data is read", end)); + logger.LogWarn("**** unexpected read: {0}, not all data is read", end); // write a different value to tell JVM to not reuse this worker SerDe.Write(outputStream, (int)SpecialLengths.END_OF_DATA_SECTION); readComplete = false; @@ -179,8 +178,8 @@ public static bool ProcessStream(Stream inputStream, Stream outputStream, int sp outputStream.Flush(); // log bytes read and write - logger.LogDebug(string.Format("total read bytes: {0}", SerDe.totalReadNum)); - logger.LogDebug(string.Format("total write bytes: {0}", SerDe.totalWriteNum)); + logger.LogDebug("total read bytes: {0}", SerDe.totalReadNum); + logger.LogDebug("total write bytes: {0}", SerDe.totalWriteNum); logger.LogDebug("Stream processing completed successfully"); } @@ -202,10 +201,10 @@ public static bool ProcessStream(Stream inputStream, Stream outputStream, int sp logger.LogError("Writing exception to stream failed with exception:"); logger.LogException(ex); } - throw e; + throw; } - logger.LogInfo(string.Format("Stop of stream processing, splitIndex: {0}, readComplete: {1}", splitIndex, readComplete)); + logger.LogInfo("Stop of stream processing, splitIndex: {0}, readComplete: {1}", splitIndex, readComplete); return readComplete; } @@ -310,7 +309,6 @@ private static IFormatter ProcessCommand(Stream inputStream, Stream outputStream int stageId = -1; string deserializerMode = null; string serializerMode = null; - CSharpWorkerFunc workerFunc = null; for (int funcIndex = 0; funcIndex < chainedFuncCount; funcIndex++) { int 
lengthOfCommandByteArray = SerDe.ReadInt(inputStream); @@ -319,17 +317,11 @@ private static IFormatter ProcessCommand(Stream inputStream, Stream outputStream if (lengthOfCommandByteArray > 0) { + CSharpWorkerFunc workerFunc; ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, out workerFunc); - if (func == null) - { - func = workerFunc; - } - else - { - func = CSharpWorkerFunc.Chain(func, workerFunc); - } + func = func == null ? workerFunc : CSharpWorkerFunc.Chain(func, workerFunc); } else { @@ -387,11 +379,14 @@ private static void ReadCommand(Stream networkStream, IFormatter formatter, out workerFunc = (CSharpWorkerFunc)formatter.Deserialize(stream); - logger.LogDebug( + if (!logger.IsDebugEnabled) return; + var sb = new StringBuilder(Environment.NewLine); + sb.AppendLine( "------------------------ Printing stack trace of workerFunc for ** debugging ** ------------------------------"); - logger.LogDebug(workerFunc.StackTrace); - logger.LogDebug( + sb.AppendLine(workerFunc.StackTrace); + sb.AppendLine( "--------------------------------------------------------------------------------------------------------------"); + logger.LogDebug(sb.ToString()); } private static void ExecuteCommand(Stream inputStream, Stream outputStream, int splitIndex, DateTime bootTime, @@ -442,9 +437,8 @@ private static void ExecuteCommand(Stream inputStream, Stream outputStream, int commandProcessWatch.Stop(); // log statistics - logger.LogInfo(string.Format("func process time: {0}", funcProcessWatch.ElapsedMilliseconds)); - logger.LogInfo(string.Format("stage {0}, command process time: {1}", stageId, - commandProcessWatch.ElapsedMilliseconds)); + logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); + logger.LogInfo("stage {0}, command process time: {1}", stageId, commandProcessWatch.ElapsedMilliseconds); } private static void WriteOutput(Stream networkStream, string serializerMode, dynamic message, IFormatter formatter) @@ -509,7 +503,7 @@ private static int ReadDiagnosticsInfo(Stream networkStream) int rddId = SerDe.ReadInt(networkStream); int stageId = SerDe.ReadInt(networkStream); int partitionId = SerDe.ReadInt(networkStream); - logger.LogInfo(string.Format("rddInfo: rddId {0}, stageId {1}, partitionId {2}", rddId, stageId, partitionId)); + logger.LogInfo("rddInfo: rddId {0}, stageId {1}, partitionId {2}", rddId, stageId, partitionId); return stageId; } @@ -517,8 +511,8 @@ private static void WriteDiagnosticsInfo(Stream networkStream, DateTime bootTime { DateTime finishTime = DateTime.UtcNow; const string format = "MM/dd/yyyy hh:mm:ss.fff tt"; - logger.LogDebug(string.Format("bootTime: {0}, initTime: {1}, finish_time: {2}", - bootTime.ToString(format), initTime.ToString(format), finishTime.ToString(format))); + logger.LogDebug("bootTime: {0}, initTime: {1}, finish_time: {2}", + bootTime.ToString(format), initTime.ToString(format), finishTime.ToString(format)); SerDe.Write(networkStream, (int)SpecialLengths.TIMING_DATA); SerDe.Write(networkStream, ToUnixTime(bootTime)); SerDe.Write(networkStream, ToUnixTime(initTime)); @@ -538,7 +532,7 @@ private static void WriteAccumulatorValues(Stream networkStream, IFormatter form item.Value.GetType() .GetField("value", BindingFlags.NonPublic | BindingFlags.Instance) .GetValue(item.Value); - logger.LogDebug(string.Format("({0}, {1})", item.Key, value)); + logger.LogDebug("({0}, {1})", item.Key, value); formatter.Serialize(ms, new KeyValuePair(item.Key, value)); byte[] buffer = ms.ToArray(); 
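// (Illustration only, not part of the patch.) The IsDebugEnabled guard used in ReadCommand
// above is the pattern this change applies wherever log text is expensive to build: skip the
// string work entirely unless debug logging is on. The folder listing below is an illustrative
// message; logger and the format-string LogDebug overload are the same ones used in this file.
if (logger.IsDebugEnabled)
{
    var folder = Path.GetDirectoryName(Assembly.GetEntryAssembly().Location);
    var fileList = string.Join(Environment.NewLine, Directory.EnumerateFiles(folder));
    logger.LogDebug("Files available in executor{0}{1}", Environment.NewLine, fileList);
}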
SerDe.Write(networkStream, buffer.Length); @@ -548,13 +542,28 @@ private static void WriteAccumulatorValues(Stream networkStream, IFormatter form public static void PrintFiles() { - logger.LogDebug("Files available in executor"); - var driverFolder = Path.GetDirectoryName(Assembly.GetEntryAssembly().Location); - var files = Directory.EnumerateFiles(driverFolder); + if (!logger.IsDebugEnabled) return; + + var folder = Path.GetDirectoryName(Assembly.GetEntryAssembly().Location); + var files = Directory.EnumerateFiles(folder).Select(Path.GetFileName).ToArray(); + var longest = files.Max(f => f.Length); + var count = 0; + var outfiles = new StringBuilder(Environment.NewLine); foreach (var file in files) { - logger.LogDebug(file); + switch (count++ % 2) + { + case 0: + outfiles.Append(" " + file.PadRight(longest + 2)); + break; + default: + outfiles.AppendLine(file); + break; + } } + + logger.LogDebug("Files available in executor"); + logger.LogDebug("Location: {0}{1}{2}", folder, Environment.NewLine, outfiles.ToString()); } private static long ToUnixTime(DateTime dt) @@ -622,7 +631,7 @@ private static IEnumerable GetIterator(Stream inputStream, string seria case SerializedMode.Pair: { byte[] pairKey = buffer; - byte[] pairValue = null; + byte[] pairValue; watch.Start(); int valueLength = SerDe.ReadInt(inputStream); @@ -650,7 +659,6 @@ private static IEnumerable GetIterator(Stream inputStream, string seria break; } - case SerializedMode.Byte: default: { if (buffer != null) @@ -669,7 +677,7 @@ private static IEnumerable GetIterator(Stream inputStream, string seria watch.Start(); } - logger.LogInfo(string.Format("total receive time: {0}", watch.ElapsedMilliseconds)); + logger.LogInfo("total receive time: {0}", watch.ElapsedMilliseconds); } internal class SparkCLRAssemblyHandler @@ -687,7 +695,7 @@ public void LoadAssemblies(string[] files) } else { - Console.Error.WriteLine("Already loaded assebmly " + assembly.FullName); + Console.Error.WriteLine("Already loaded assembly " + assembly.FullName); } } } From da54d509ed3e27c011e994e6e399daff5b8df8f7 Mon Sep 17 00:00:00 2001 From: Kaarthik Sivashanmugam Date: Tue, 6 Sep 2016 13:11:59 -0700 Subject: [PATCH 04/15] added support for SparkSession, Catalog and Dataset --- .gitignore | 4 + .../Microsoft.Spark.CSharp/Adapter.csproj | 10 + .../Microsoft.Spark.CSharp/Core/SparkConf.cs | 25 + .../Core/SparkContext.cs | 20 + .../Proxy/ICatalogProxy.cs | 52 ++ .../Proxy/IDatasetProxy.cs | 16 + .../Proxy/ISparkConfProxy.cs | 1 + .../Proxy/ISparkContextProxy.cs | 3 +- .../Proxy/ISparkSessionProxy.cs | 27 ++ .../Proxy/ISqlContextProxy.cs | 1 - .../Proxy/Ipc/CatalogIpcProxy.cs | 154 ++++++ .../Proxy/Ipc/DatasetIpcProxy.cs | 35 ++ .../Proxy/Ipc/SparkConfIpcProxy.cs | 7 +- .../Proxy/Ipc/SparkContextIpcProxy.cs | 16 +- .../Proxy/Ipc/SparkSessionIpcProxy.cs | 101 ++++ .../Proxy/Ipc/SqlContextIpcProxy.cs | 6 - .../Microsoft.Spark.CSharp/Sql/Builder.cs | 130 +++++ .../Microsoft.Spark.CSharp/Sql/Catalog.cs | 350 ++++++++++++++ .../Microsoft.Spark.CSharp/Sql/DataFrame.cs | 14 +- .../Microsoft.Spark.CSharp/Sql/Dataset.cs | 135 ++++++ .../Microsoft.Spark.CSharp/Sql/HiveContext.cs | 11 +- .../Sql/SparkSession.cs | 140 ++++++ .../Microsoft.Spark.CSharp/Sql/SqlContext.cs | 38 +- .../Microsoft.Spark.CSharp.Adapter.Doc.XML | 453 ++++++++++++++++++ .../documentation/Mobius_API_Documentation.md | 98 +++- csharp/AdapterTest/AdapterTest.csproj | 5 + csharp/AdapterTest/BuilderTest.cs | 50 ++ csharp/AdapterTest/CatalogTest.cs | 212 ++++++++ csharp/AdapterTest/DatasetTest.cs | 150 ++++++ 
csharp/AdapterTest/HiveContextTest.cs | 34 +- .../AdapterTest/Mocks/MockSparkConfProxy.cs | 5 + .../Mocks/MockSparkContextProxy.cs | 13 +- .../Mocks/MockSparkSessionProxy.cs | 53 ++ .../AdapterTest/Mocks/MockSqlContextProxy.cs | 5 - csharp/AdapterTest/SparkSessionTest.cs | 30 ++ csharp/AdapterTest/SqlContextTest.cs | 66 ++- .../Samples/Microsoft.Spark.CSharp/App.config | 3 +- .../Microsoft.Spark.CSharp/CatalogSamples.cs | 28 ++ .../DataFrameSamples.cs | 10 +- .../Samples/Microsoft.Spark.CSharp/Program.cs | 1 + .../Microsoft.Spark.CSharp/Samples.csproj | 2 + .../SparkSessionSamples.cs | 189 ++++++++ .../spark/sql/api/csharp/JvmBridgeUtils.scala | 20 + .../spark/sql/api/csharp/SQLUtils.scala | 9 +- .../util/csharp/JvmBridgeUtilsSuite.scala | 31 ++ 45 files changed, 2671 insertions(+), 92 deletions(-) create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ICatalogProxy.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDatasetProxy.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkSessionProxy.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/CatalogIpcProxy.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DatasetIpcProxy.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Sql/Builder.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Sql/Catalog.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs create mode 100644 csharp/AdapterTest/BuilderTest.cs create mode 100644 csharp/AdapterTest/CatalogTest.cs create mode 100644 csharp/AdapterTest/DatasetTest.cs create mode 100644 csharp/AdapterTest/Mocks/MockSparkSessionProxy.cs create mode 100644 csharp/AdapterTest/SparkSessionTest.cs create mode 100644 csharp/Samples/Microsoft.Spark.CSharp/CatalogSamples.cs create mode 100644 csharp/Samples/Microsoft.Spark.CSharp/SparkSessionSamples.cs create mode 100644 scala/src/test/scala/org/apache/spark/util/csharp/JvmBridgeUtilsSuite.scala diff --git a/.gitignore b/.gitignore index 5ad71338..b42159a0 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,10 @@ build/dependencies/ *.log lib/ +# Local databases used for Dataset/frames # +########################################### +scala/metastore_db/ + # Generated Files # ############ SparkCLRCodeCoverage.xml diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj b/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj index 56fb696d..4daf4aa5 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj @@ -102,20 +102,25 @@ + + + + + @@ -125,6 +130,7 @@ + @@ -134,17 +140,21 @@ + + + + diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkConf.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkConf.cs index e7cdc161..42600236 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkConf.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkConf.cs @@ -2,6 +2,8 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. 
using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; using Microsoft.Spark.CSharp.Configuration; using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Proxy; @@ -122,6 +124,29 @@ public string Get(string key, string defaultValue) { return sparkConfProxy.Get(key, defaultValue); } + + /// + /// Get all parameters as a list of pairs + /// + public Dictionary GetAll() + { + var configKvp = new Dictionary(); + var kvpStringCollection = sparkConfProxy.GetSparkConfAsString(); + var kvpStringArray = Regex.Split(kvpStringCollection, ";"); + foreach (var kvpString in kvpStringArray) + { + if (!string.IsNullOrEmpty(kvpString)) + { + var kvpItems = Regex.Split(kvpString, "="); + if (kvpItems.Length == 2 && !string.IsNullOrEmpty(kvpItems[0]) && !string.IsNullOrEmpty(kvpItems[1])) + { + configKvp.Add(kvpItems[0], kvpItems[1]); + } + } + } + + return configKvp; + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkContext.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkContext.cs index f16220c0..bc8faac4 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkContext.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/SparkContext.cs @@ -129,6 +129,7 @@ internal SparkContext(ISparkContextProxy sparkContextProxy, SparkConf conf) { SparkContextProxy = sparkContextProxy; SparkConf = conf; + _activeSparkContext = this; } private SparkContext(string master, string appName, string sparkHome, SparkConf conf) @@ -145,6 +146,25 @@ private SparkContext(string master, string appName, string sparkHome, SparkConf _activeSparkContext = this; } + /// + /// This function may be used to get or instantiate a SparkContext and register it as a + /// singleton object. Because we can only have one active SparkContext per JVM, + /// this is useful when applications may wish to share a SparkContext. + /// Note: This function cannot be used to create multiple SparkContext instances + /// even if multiple contexts are allowed. + /// + /// + /// + public static SparkContext GetOrCreate(SparkConf conf) + { + if (_activeSparkContext == null) + { + _activeSparkContext = new SparkContext(conf); + } + + return _activeSparkContext; + } + internal void StartAccumulatorServer() { if (accumulatorServer == null) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ICatalogProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ICatalogProxy.cs new file mode 100644 index 00000000..95570cb1 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ICatalogProxy.cs @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. 
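// A small self-contained sketch (not part of the patch) of the parsing contract behind the new
// SparkConf.GetAll() above: the JVM-side proxy returns the configuration as one ';'-separated
// string of "key=value" entries, and empty or malformed entries are skipped. The input string
// here is only a sample.
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

class GetAllParsingDemo
{
    static void Main()
    {
        const string kvpStringCollection = "spark.master=local[*];spark.app.name=MobiusSample;;not-a-pair;";
        var configKvp = new Dictionary<string, string>();

        foreach (var kvpString in Regex.Split(kvpStringCollection, ";"))
        {
            if (string.IsNullOrEmpty(kvpString)) continue;

            var kvpItems = Regex.Split(kvpString, "=");
            if (kvpItems.Length == 2 && !string.IsNullOrEmpty(kvpItems[0]) && !string.IsNullOrEmpty(kvpItems[1]))
            {
                configKvp.Add(kvpItems[0], kvpItems[1]);
            }
        }

        // Prints the two well-formed pairs; the empty and malformed entries are dropped.
        foreach (var kvp in configKvp)
        {
            Console.WriteLine("{0} = {1}", kvp.Key, kvp.Value);
        }
    }
}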
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Sql; +using Microsoft.Spark.CSharp.Sql.Catalog; +using Column = Microsoft.Spark.CSharp.Sql.Catalog.Column; + +namespace Microsoft.Spark.CSharp.Proxy +{ + interface ICatalogProxy + { + string CurrentDatabase { get; } + + void SetCurrentDatabase(string dbName); + + Dataset ListDatabases(); + + Dataset ListTables(string dbName); + + Dataset ListFunctions(string dbName); + + Dataset ListColumns(string tableName); + + Dataset ListColumns(string dbName, string tableName); + + void DropTempTable(string tableName); + + bool IsCached(string tableName); + + void CacheTable(string tableName); + + void UnCacheTable(string tableName); + + void RefreshTable(string tableName); + + void ClearCache(); + + DataFrame CreateExternalTable(string tableName, string path); + + DataFrame CreateExternalTable(string tableName, string path, string source); + + DataFrame CreateExternalTable(string tableName, string source, Dictionary options); + + DataFrame CreateExternalTable(string tableName, string source, StructType schema, + Dictionary options); + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDatasetProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDatasetProxy.cs new file mode 100644 index 00000000..4a760f85 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDatasetProxy.cs @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Microsoft.Spark.CSharp.Proxy +{ + interface IDatasetProxy + { + IDataFrameProxy ToDF(); + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkConfProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkConfProxy.cs index 3c7069bf..2d7d4d11 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkConfProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkConfProxy.cs @@ -18,5 +18,6 @@ internal interface ISparkConfProxy void Set(string key, string value); int GetInt(string key, int defaultValue); string Get(string key, string defaultValue); + string GetSparkConfAsString(); } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs index 51324332..2861e068 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs @@ -15,8 +15,7 @@ namespace Microsoft.Spark.CSharp.Proxy internal interface ISparkContextProxy { ISparkConfProxy GetConf(); - ISqlContextProxy CreateSqlContext(); - ISqlContextProxy CreateHiveContext(); + ISparkSessionProxy CreateSparkSession(); IColumnProxy CreateColumnFromName(string name); IColumnProxy CreateFunction(string name, object self); IColumnProxy CreateBinaryMathFunction(string name, object self, object other); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkSessionProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkSessionProxy.cs new file mode 100644 index 00000000..56f869cd --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkSessionProxy.cs @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. 
See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Sql; + +namespace Microsoft.Spark.CSharp.Proxy +{ + internal interface IUdfRegistration { } + + interface ISparkSessionProxy + { + ISqlContextProxy SqlContextProxy { get; } + IUdfRegistration Udf { get; } + ICatalogProxy GetCatalog(); + IDataFrameReaderProxy Read(); + ISparkSessionProxy NewSession(); + IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy); + IDataFrameProxy Table(string tableName); + IDataFrameProxy Sql(string query); + void Stop(); + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISqlContextProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISqlContextProxy.cs index 3dd5a76f..60531295 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISqlContextProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISqlContextProxy.cs @@ -14,7 +14,6 @@ namespace Microsoft.Spark.CSharp.Proxy internal interface ISqlContextProxy { IDataFrameReaderProxy Read(); - ISqlContextProxy NewSession(); string GetConf(string key, string defaultValue); void SetConf(string key, string value); IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/CatalogIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/CatalogIpcProxy.cs new file mode 100644 index 00000000..b0e60568 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/CatalogIpcProxy.cs @@ -0,0 +1,154 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Sql; +using Microsoft.Spark.CSharp.Sql.Catalog; + +namespace Microsoft.Spark.CSharp.Proxy.Ipc +{ + [ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not reqiured + internal class CatalogIpcProxy : ICatalogProxy + { + private readonly JvmObjectReference jvmCatalogReference; + private readonly ISqlContextProxy sqlContextProxy; + + internal CatalogIpcProxy(JvmObjectReference jvmCatalogReference, ISqlContextProxy sqlContextProxy) + { + this.jvmCatalogReference = jvmCatalogReference; + this.sqlContextProxy = sqlContextProxy; + } + + public string CurrentDatabase + { + get + { + return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "currentDatabase").ToString(); + } + } + + public void CacheTable(string tableName) + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "cacheTable", new object[] { tableName }); + } + + public void ClearCache() + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "clearCache"); + } + + public DataFrame CreateExternalTable(string tableName, string path) + { + return new DataFrame( + new DataFrameIpcProxy( + new JvmObjectReference( + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "createExternalTable", + new object[] {tableName, path}).ToString()), sqlContextProxy), SparkContext.GetActiveSparkContext()); + } + + public DataFrame CreateExternalTable(string tableName, string source, Dictionary options) + { + throw new NotImplementedException(); //TODO - implement + } + + public DataFrame CreateExternalTable(string tableName, string path, string source) + { + return new DataFrame( + new DataFrameIpcProxy( + new JvmObjectReference( + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "createExternalTable", + new object[] { tableName, path, source }).ToString()), sqlContextProxy), SparkContext.GetActiveSparkContext()); + } + + public DataFrame CreateExternalTable(string tableName, string source, StructType schema, Dictionary options) + { + throw new NotImplementedException(); //TODO - implement + } + + public void DropTempTable(string tableName) + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "dropTempView", new object[] { tableName }); + } + + public bool IsCached(string tableName) + { + return + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "isCached", + new object[] {tableName}).ToString().Equals("true", StringComparison.InvariantCultureIgnoreCase); + } + + public Dataset ListColumns(string tableName) + { + return new Dataset( + new DatasetIpcProxy( + new JvmObjectReference( + (string) + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "listColumns", + new object[] { tableName })), sqlContextProxy)); + } + + public Dataset ListColumns(string dbName, string tableName) + { + return new Dataset( + new DatasetIpcProxy( + new JvmObjectReference( + (string) + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "listColumns", + new object[] { dbName, tableName })), sqlContextProxy)); + } + + public Dataset ListDatabases() + { + return new Dataset( + new DatasetIpcProxy( + new 
JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "listDatabases")), sqlContextProxy)); + } + + public Dataset ListFunctions(string dbName) + { + return new Dataset( + new DatasetIpcProxy( + new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "listFunctions", new object[] { dbName })), sqlContextProxy)); + } + + public Dataset
ListTables(string dbName = null) + { + if (dbName != null) + return new Dataset
( + new DatasetIpcProxy( + new JvmObjectReference( + (string) + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "listTables", + new object[] {dbName})), sqlContextProxy)); + else + return new Dataset
( + new DatasetIpcProxy( + new JvmObjectReference( + (string) + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "listTables")), + sqlContextProxy)); + } + + public void SetCurrentDatabase(string dbName) + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "setCurrentDatabase", new object[] { dbName }); + } + + public void UnCacheTable(string tableName) + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "uncacheTable", new object[] { tableName }); + } + + public void RefreshTable(string tableName) + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmCatalogReference, "refreshTable", new object[] { tableName }); + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DatasetIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DatasetIpcProxy.cs new file mode 100644 index 00000000..84b4c581 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DatasetIpcProxy.cs @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Interop.Ipc; + +namespace Microsoft.Spark.CSharp.Proxy.Ipc +{ + [ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not reqiured + internal class DatasetIpcProxy : IDatasetProxy + { + private readonly JvmObjectReference jvmDatasetReference; + private readonly ISqlContextProxy sqlContextProxy; + + internal DatasetIpcProxy(JvmObjectReference jvmDatasetReference, ISqlContextProxy sqlContextProxy) + { + this.jvmDatasetReference = jvmDatasetReference; + this.sqlContextProxy = sqlContextProxy; + } + + public IDataFrameProxy ToDF() + { + return new DataFrameIpcProxy( + new JvmObjectReference( + (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDatasetReference, "toDF")), + sqlContextProxy + ); + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkConfIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkConfIpcProxy.cs index a314082a..dbc9e083 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkConfIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkConfIpcProxy.cs @@ -57,5 +57,10 @@ public string Get(string key, string defaultValue) { return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkConfReference, "get", new object[] { key, defaultValue }).ToString(); } - } + + public string GetSparkConfAsString() + { + return SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.JvmBridgeUtils", "getSparkConfAsString").ToString(); + } +} } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs index 6521b8d9..2e0534e6 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs @@ -38,17 +38,15 @@ public SparkContextIpcProxy(JvmObjectReference jvmSparkContextReference, JvmObje this.jvmSparkContextReference = jvmSparkContextReference; this.jvmJavaContextReference = jvmJavaContextReference; } - - public ISqlContextProxy CreateSqlContext() - { - return new SqlContextIpcProxy(new 
JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSQLContext", new object[] { jvmSparkContextReference }))); - } - public ISqlContextProxy CreateHiveContext() + public ISparkSessionProxy CreateSparkSession() { - return new SqlContextIpcProxy(new JvmObjectReference( - (string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod( - "org.apache.spark.sql.api.csharp.SQLUtils", "createHiveContext", new object[] { jvmSparkContextReference }))); + return + new SparkSessionIpcProxy( + new JvmObjectReference( + (string) + SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", + "createSparkSession", new object[] {jvmSparkContextReference}))); } public void CreateSparkContext(string master, string appName, string sparkHome, ISparkConfProxy conf) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs new file mode 100644 index 00000000..d134c086 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs @@ -0,0 +1,101 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Interop.Ipc; + +namespace Microsoft.Spark.CSharp.Proxy.Ipc +{ + [ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not reqiured + internal class SparkSessionIpcProxy : ISparkSessionProxy + { + private readonly JvmObjectReference jvmSparkSessionReference; + private readonly ISqlContextProxy sqlContextProxy; + + private readonly IUdfRegistration udfRegistration; + + public IUdfRegistration Udf + { + get + { + if (udfRegistration == null) + { + //TODO implementation needed + } + + return udfRegistration; + } + } + + public ISqlContextProxy SqlContextProxy + { + get { return sqlContextProxy; } + } + + public ICatalogProxy GetCatalog() + { + return new CatalogIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "catalog")), sqlContextProxy); + } + + internal SparkSessionIpcProxy(JvmObjectReference jvmSparkSessionReference) + { + this.jvmSparkSessionReference = jvmSparkSessionReference; + sqlContextProxy = new SqlContextIpcProxy(GetSqlContextReference()); + } + + private JvmObjectReference GetSqlContextReference() + { + return + new JvmObjectReference( + (string) SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "getSqlContext", new object[] { jvmSparkSessionReference })); + } + + public ISparkSessionProxy NewSession() + { + return new SparkSessionIpcProxy( + new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "newSession"))); + } + + public IDataFrameReaderProxy Read() + { + var javaDataFrameReaderReference = SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "read"); + return new DataFrameReaderIpcProxy(new JvmObjectReference(javaDataFrameReaderReference.ToString()), sqlContextProxy); + } + + public IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy) + { + var rdd = new 
JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "byteArrayRDDToAnyArrayRDD", + new object[] { (rddProxy as RDDIpcProxy).JvmRddReference }).ToString()); + + return new DataFrameIpcProxy( + new JvmObjectReference( + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "applySchemaToPythonRDD", + new object[] { rdd, (structTypeProxy as StructTypeIpcProxy).JvmStructTypeReference }).ToString()), sqlContextProxy); + } + + public IDataFrameProxy Sql(string sqlQuery) + { + var javaDataFrameReference = SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "sql", new object[] { sqlQuery }); + var javaObjectReferenceForDataFrame = new JvmObjectReference(javaDataFrameReference.ToString()); + return new DataFrameIpcProxy(javaObjectReferenceForDataFrame, sqlContextProxy); + } + + public IDataFrameProxy Table(string tableName) + { + return new DataFrameIpcProxy( + new JvmObjectReference( + (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "table", + new object[] { tableName })), sqlContextProxy); + } + + public void Stop() + { + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "stop"); + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs index e22d6877..4bb930fe 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs @@ -112,12 +112,6 @@ public void RegisterFunction(string name, byte[] command, string returnType) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(judf, "registerPython", new object[] { name, udf }); } - public ISqlContextProxy NewSession() - { - return new SqlContextIpcProxy( - new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "newSession"))); - } - public string GetConf(string key, string defaultValue) { return (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "getConf", new object[] { key, defaultValue }); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Builder.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Builder.cs new file mode 100644 index 00000000..24af064d --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Builder.cs @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Core; + +namespace Microsoft.Spark.CSharp.Sql +{ + /// + /// The entry point to programming Spark with the Dataset and DataFrame API. + /// + public class Builder + { + internal Dictionary options = new Dictionary(); + + internal Builder() { } + + /// + /// Sets the Spark master URL to connect to, such as "local" to run locally, "local[4]" to + /// run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster. + /// + /// Master URL + public Builder Master(string master) + { + Config("spark.master", master); + return this; + } + + /// + /// Sets a name for the application, which will be shown in the Spark web UI. + /// If no application name is set, a randomly generated name will be used. 
+ /// + /// Name of the app + public Builder AppName(string appName) + { + Config("spark.app.name", appName); + return this; + } + + /// + /// Sets a config option. Options set using this method are automatically propagated to + /// both SparkConf and SparkSession's own configuration. + /// + /// Key for the configuration + /// value of the configuration + public Builder Config(string key, string value) + { + options[key] = value; + return this; + } + + /// + /// Sets a config option. Options set using this method are automatically propagated to + /// both SparkConf and SparkSession's own configuration. + /// + /// Key for the configuration + /// value of the configuration + public Builder Config(string key, bool value) + { + options[key] = value.ToString(); + return this; + } + + /// + /// Sets a config option. Options set using this method are automatically propagated to + /// both SparkConf and SparkSession's own configuration. + /// + /// Key for the configuration + /// value of the configuration + public Builder Config(string key, double value) + { + options[key] = value.ToString(); + return this; + } + + /// + /// Sets a config option. Options set using this method are automatically propagated to + /// both SparkConf and SparkSession's own configuration. + /// + /// Key for the configuration + /// value of the configuration + public Builder Config(string key, long value) + { + options[key] = value.ToString(); + return this; + } + + /// + /// Sets a list of config options based on the given SparkConf + /// + public Builder Config(SparkConf conf) + { + foreach (var keyValuePair in conf.GetAll()) + { + options[keyValuePair.Key] = keyValuePair.Value; + } + + return this; + } + + /// + /// Enables Hive support, including connectivity to a persistent Hive metastore, support for + /// Hive serdes, and Hive user-defined functions. + /// + public Builder EnableHiveSupport() + { + return Config("spark.sql.catalogImplementation", "hive"); + } + + /// + /// Gets an existing [[SparkSession]] or, if there is no existing one, creates a new + /// one based on the options set in this builder. + /// + /// + public SparkSession GetOrCreate() + { + var sparkConf = new SparkConf(); + foreach (var option in options) + { + sparkConf.Set(option.Key, option.Value); + } + var sparkContext = SparkContext.GetOrCreate(sparkConf); + return SqlContext.GetOrCreate(sparkContext).SparkSession; + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Catalog.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Catalog.cs new file mode 100644 index 00000000..94859fcc --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Catalog.cs @@ -0,0 +1,350 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Proxy; + +namespace Microsoft.Spark.CSharp.Sql.Catalog +{ + /// + /// Catalog interface for Spark. + /// + public class Catalog + { + ICatalogProxy catalogProxy; + + internal Catalog(ICatalogProxy catalogProxy) + { + this.catalogProxy = catalogProxy; + } + + /// + /// Returns the current default database in this session. 
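// Illustration only, not part of this patch: a minimal sketch of how the fluent
// Builder above is expected to be chained from application code. The master URL,
// application name and config entry below are hypothetical sample values.
SparkSession session = SparkSession.Builder()
    .Master("local[2]")                             // run locally with 2 cores
    .AppName("MobiusBuilderSample")                 // shown in the Spark web UI
    .Config("spark.ui.showConsoleProgress", false)  // any Spark/SparkSession option
    .EnableHiveSupport()                            // spark.sql.catalogImplementation=hive
    .GetOrCreate();                                 // backed by SparkContext.GetOrCreate(conf)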
+ /// + public string CurrentDatabase + { + get { return catalogProxy.CurrentDatabase; } + } + + // TODO Enable these convenience functions if needed + /* + public List GetDatabasesList() + { + var rows = ListDatabases().Collect(); + var list = new List(); + foreach (var row in rows) + { + list.Add(new Database + { + Name = row.Get("name"), + Description = row.Get("description"), + LocationUri = row.Get("locationUri") + }); + } + + return list; + } + + public List
GetTablesList(string dbName = null) + { + var tables = ListTables(dbName).Collect(); + //iterate and construct Table + throw new NotImplementedException(); + } + + public List
GetColumnsList(string tableName, string dbName = null) + { + var tables = ListColumns(tableName, dbName).Collect(); + //iterate and construct Column + throw new NotImplementedException(); + } + + public List
GetFunctionsList(string dbName = null) + { + var tables = ListFunctions(dbName).Collect(); + //iterate and construct Table + throw new NotImplementedException(); + } + */ + + /// + /// Returns a list of databases available across all sessions. + /// + /// + public DataFrame ListDatabases() + { + return catalogProxy.ListDatabases().ToDF(); + } + + /// + /// Returns a list of tables in the current database or given database + /// This includes all temporary tables. + /// + /// Optional database name. If not provided, current database is used + public DataFrame ListTables(string dbName = null) + { + return catalogProxy.ListTables(dbName ?? CurrentDatabase).ToDF(); + } + + /// + /// Returns a list of columns for the given table in the current database or + /// the given temporary table. + /// + /// Name of the table + /// Name of the database. If database is not provided, current database is used + public DataFrame ListColumns(string tableName, string dbName = null) + { + return catalogProxy.ListColumns(tableName, dbName ?? CurrentDatabase).ToDF(); + } + + /// + /// Returns a list of functions registered in the specified database. + /// This includes all temporary functions + /// + /// Name of the database. If database is not provided, current database is used + public DataFrame ListFunctions(string dbName = null) + { + return catalogProxy.ListFunctions(dbName ?? CurrentDatabase).ToDF(); + } + + /// + /// Sets the current default database in this session. + /// + /// Name of database + public void SetCurrentDatabase(string dbName) + { + catalogProxy.SetCurrentDatabase(dbName); + } + + /// + /// Drops the temporary view with the given view name in the catalog. + /// If the view has been cached before, then it will also be uncached. + /// + /// Name of the table + public void DropTempView(string tempViewName) + { + catalogProxy.DropTempTable(tempViewName); + } + + /// + /// Returns true if the table is currently cached in-memory. + /// + /// Name of the table + public bool IsCached(string tableName) + { + return catalogProxy.IsCached(tableName); + } + + /// + /// Caches the specified table in-memory. + /// + /// Name of the table + public void CacheTable(string tableName) + { + catalogProxy.CacheTable(tableName); + } + + /// + /// Removes the specified table from the in-memory cache. + /// + /// Name of the table + public void UnCacheTable(string tableName) + { + catalogProxy.UnCacheTable(tableName); + } + + /// + /// Invalidate and refresh all the cached metadata of the given table. For performance reasons, + /// Spark SQL or the external data source library it uses might cache certain metadata about a + /// table, such as the location of blocks.When those change outside of Spark SQL, users should + /// call this function to invalidate the cache. + /// If this table is cached as an InMemoryRelation, drop the original cached version and make the + /// new version cached lazily. + /// + /// Name of the table + public void RefreshTable(string tableName) + { + catalogProxy.RefreshTable(tableName); + } + + /// + /// Removes all cached tables from the in-memory cache. + /// + public void ClearCache() + { + catalogProxy.ClearCache(); + } + + /// + /// Creates an external table from the given path and returns the corresponding DataFrame. + /// It will use the default data source configured by spark.sql.sources.default. 
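// Illustration only, not part of this patch: a rough sketch of the catalog calls above.
// Assumes a SparkSession named 'session' built elsewhere and a registered table named
// "people"; both names are hypothetical.
var catalog = session.Catalog;
string db = catalog.CurrentDatabase;          // e.g. "default"
DataFrame tables = catalog.ListTables();      // current database when dbName is omitted
tables.Show();
catalog.CacheTable("people");
bool cached = catalog.IsCached("people");     // expected to be true after CacheTable
catalog.RefreshTable("people");               // refreshed copy is re-cached lazily
catalog.UnCacheTable("people");
catalog.ClearCache();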
+ /// + /// Name of the table + /// Path to table + public DataFrame CreateExternalTable(string tableName, string path) + { + return catalogProxy.CreateExternalTable(tableName, path); + } + + /// + /// Creates an external table from the given path on a data source and returns DataFrame + /// + /// Name of the table + /// Path to table + /// Data source + public DataFrame CreateExternalTable(string tableName, string path, string source) + { + return catalogProxy.CreateExternalTable(tableName, path, source); + } + + /// + /// Creates an external table from the given path based on a data source and a set of options. + /// Then, returns the corresponding DataFrame. + /// + /// Name of the table + /// Data source + /// Options to create table + /// + public DataFrame CreateExternalTable(string tableName, string source, Dictionary options) + { + return catalogProxy.CreateExternalTable(tableName, source, options); + } + + /// + /// Create an external table from the given path based on a data source, a schema and + /// a set of options.Then, returns the corresponding DataFrame. + /// + /// Name of the table + /// Data source + /// Schema of the table + /// Options to create table + /// + public DataFrame CreateExternalTable(string tableName, string source, StructType schema, Dictionary options) + { + return catalogProxy.CreateExternalTable(tableName, source, schema, options); + } + } + + /// + /// A database in Spark + /// + public class Database + { + /// + /// Name of the database + /// + public string Name { get; internal set; } + + /// + /// Desciption for the database + /// + public string Description { get; internal set; } + + /// + /// Location of the database + /// + public string LocationUri { get; internal set; } + } + + /// + /// A table in Spark + /// + public class Table + { + /// + /// Name of the table + /// + public string Name { get; internal set; } + + /// + /// Name of the database Table belongs to + /// + public string Database { get; internal set; } + + /// + /// Description of the table + /// + public string Description { get; internal set; } + + /// + /// Type of the table (table, view) + /// + public string TableType { get; internal set; } + + /// + /// Whether the table is a temporary table + /// + public bool IsTemporary { get; internal set; } + } + + /// + /// A column in Spark + /// + public class Column + { + /// + /// Name of the column + /// + public string Name { get; internal set; } + + /// + /// Datatype of the column + /// + public string DataType { get; internal set; } + + /// + /// Description of the column + /// + public string Description { get; internal set; } + + /// + /// Whether the column value can be null + /// + public bool IsNullable { get; internal set; } + + /// + /// Whether the column is a partition column. + /// + public bool IsPartition { get; internal set; } + + /// + /// Whether the column is a bucket column. + /// + public bool IsBucket { get; internal set; } + } + + /// + /// A user-defined function in Spark + /// + public class Function + { + /// + /// Name of the column + /// + public string Name { get; internal set; } + + /// + /// Name of the database + /// + public string Database { get; internal set; } + + /// + /// Description of the function + /// + public string Description { get; internal set; } + + /// + /// Fully qualified class name of the function + /// + public string ClassName { get; internal set; } + + /// + /// Whether the function is a temporary function or not. 
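// Illustration only, not part of this patch: how the CreateExternalTable overloads above
// might be called. The table names, paths and data source are hypothetical; the
// two-argument form falls back to the source configured by spark.sql.sources.default.
DataFrame events = session.Catalog.CreateExternalTable("events", "hdfs:///data/events");
DataFrame logs = session.Catalog.CreateExternalTable("logs", "hdfs:///data/logs.json", "json");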
+ /// + public bool IsTemporary { get; internal set; } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs index 803655a9..66601ca2 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs @@ -170,11 +170,17 @@ public IEnumerable Collect() return Rdd.Collect(port).Cast(); } + //TODO - add this method if needed to convert Row to collection of T + //public IEnumerable Collect() + //{ + // throw new NotImplementedException(); + //} + /// - /// Converts the DataFrame to RDD of Row - /// - /// resulting RDD - public RDD ToRDD() //RDD created using byte representation of Row objects + /// Converts the DataFrame to RDD of Row + /// + /// resulting RDD + public RDD ToRDD() //RDD created using byte representation of Row objects { return Rdd; } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs new file mode 100644 index 00000000..b3a81cf0 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Proxy; + +namespace Microsoft.Spark.CSharp.Sql +{ + /// + /// Dataset is a strongly typed collection of domain-specific objects that can be transformed + /// in parallel using functional or relational operations.Each Dataset also has an untyped view + /// called a DataFrame, which is a Dataset of Row. + /// + public class Dataset + { + IDatasetProxy datasetProxy; + + internal Dataset(IDatasetProxy datasetProxy) + { + this.datasetProxy = datasetProxy; + } + + /// + /// Selects column based on the column name + /// + /// Name of the column + /// + public Column this[string columnName] + { + get { return ToDF()[columnName]; } + } + + private DataFrame dataFrame; + + /// + /// Converts this strongly typed collection of data to generic Dataframe. In contrast to the + /// strongly typed objects that Dataset operations work on, a Dataframe returns generic[[Row]] + /// objects that allow fields to be accessed by ordinal or name. + /// + /// DataFrame created from Dataset + public DataFrame ToDF() + { + return dataFrame ?? (dataFrame = new DataFrame(datasetProxy.ToDF(), SparkContext.GetActiveSparkContext())); + } + + /// + /// Prints the schema to the console in a nice tree format. + /// + public void PrintSchema() + { + ToDF().ShowSchema(); + } + + /// + /// Prints the plans (logical and physical) to the console for debugging purposes. + /// + /// + public void Explain(bool extended) + { + ToDF().Explain(extended); + } + + /// + /// Prints the physical plan to the console for debugging purposes. + /// + public void Explain() + { + ToDF().Explain(); + } + + /// + /// Returns all column names and their data types as an array. + /// + public IEnumerable> DTypes() + { + return ToDF().DTypes(); + } + + /// + /// Returns all column names as an array. + /// + public IEnumerable Columns() + { + return ToDF().Columns(); + } + + /// + /// Displays the top 20 rows of Dataset in a tabular form. Strings more than 20 characters + /// will be truncated, and all cells will be aligned right. 
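// Illustration only, not part of this patch: Dataset instances are created internally by
// the adapter (for example by the catalog proxy); the untyped operations above delegate
// to the underlying DataFrame. 'ds' below is a hypothetical Dataset variable.
DataFrame df = ds.ToDF();       // created once and cached inside the Dataset
ds.PrintSchema();               // same as df.ShowSchema()
ds.Explain(true);               // logical and physical plans
ds.Show(10, truncate: false);   // delegates to df.Show(10, false)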
+ /// + /// Number of rows - default is 20 + /// Indicates if rows with more than 20 characters to be truncated + public void Show(int numberOfRows = 20, bool truncate = true) + { + ToDF().Show(numberOfRows, truncate); + } + + /// + /// Prints schema + /// + public void ShowSchema() + { + ToDF().ShowSchema(); + } + } + + /// + /// Dataset of specific types + /// + /// Type parameter + public class Dataset : Dataset + { + internal Dataset(IDatasetProxy datasetProxy): base(datasetProxy) {} + + /************************************************************ + * Would it be useful to expose methods like the following? + * It would offer static type checking at the cost of runtime optimizations + * because C# functionality need to execute in CLR + ************************************************************ + + public Dataset Filter(Func func) + { + throw new NotImplementedException(); + } + + public Dataset Map(Func mapFunc) + { + throw new NotImplementedException(); + } + + */ + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/HiveContext.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/HiveContext.cs index 434ac076..9274cf5a 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/HiveContext.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/HiveContext.cs @@ -17,13 +17,8 @@ public class HiveContext : SqlContext /// Creates a HiveContext /// /// - public HiveContext(SparkContext sparkContext) - : base(sparkContext, sparkContext.SparkContextProxy.CreateHiveContext()) - { - } - - internal HiveContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy) - : base(sparkContext, sqlContextProxy) + public HiveContext(SparkContext sparkContext) + : base(SparkSession.Builder().Config(sparkContext.SparkConf).EnableHiveSupport().GetOrCreate()) { } @@ -36,7 +31,7 @@ internal HiveContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy /// public void RefreshTable(string tableName) { - SqlContextProxy.RefreshTable(tableName); + SparkSession.Catalog.RefreshTable(tableName); } } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs new file mode 100644 index 00000000..3ff8a8ab --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.Remoting.Contexts; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Services; +using Microsoft.Spark.CSharp.Sql.Catalog; + +namespace Microsoft.Spark.CSharp.Sql +{ + /// + /// The entry point to programming Spark with the Dataset and DataFrame API. + /// + public class SparkSession + { + private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkSession)); + + private ISparkSessionProxy sparkSessionProxy; + private readonly SparkContext sparkContext; + + internal ISparkSessionProxy SparkSessionProxy + { + get { return sparkSessionProxy; } + //setter is used only for testing...//TODO - refactor + set { sparkSessionProxy = value; } + } + + private Catalog.Catalog catalog; + + /// + /// Interface through which the user may create, drop, alter or query underlying + /// databases, tables, functions etc. 
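// Illustration only, not part of this patch: the HiveContext change above keeps the
// existing caller-side API while routing it through a Hive-enabled SparkSession.
// The application name and table name below are hypothetical.
var conf = new SparkConf();
conf.SetAppName("MobiusHiveSample");
var sparkContext = new SparkContext(conf);
var hiveContext = new HiveContext(sparkContext);   // now built on SparkSession with EnableHiveSupport
hiveContext.Sql("CREATE TABLE IF NOT EXISTS people (name string, age int)");
hiveContext.RefreshTable("people");                // delegates to SparkSession.Catalog.RefreshTable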
+ /// + public Catalog.Catalog Catalog + { + get { return catalog ?? (catalog = new Catalog.Catalog(SparkSessionProxy.GetCatalog())); } + } + + internal SparkContext SparkContext + { + get { return sparkContext; } + } + + /// + /// Builder for SparkSession + /// + public static Builder Builder() + { + return new Builder(); + } + + internal SparkSession(SparkContext sparkContext) + { + sparkSessionProxy = sparkContext.SparkContextProxy.CreateSparkSession(); + this.sparkContext = sparkContext; + } + + internal SparkSession(ISparkSessionProxy sparkSessionProxy) + { + this.sparkSessionProxy = sparkSessionProxy; + } + + /// + /// Start a new session with isolated SQL configurations, temporary tables, registered + /// functions are isolated, but sharing the underlying [[SparkContext]] and cached data. + /// Note: Other than the [[SparkContext]], all shared state is initialized lazily. + /// This method will force the initialization of the shared state to ensure that parent + /// and child sessions are set up with the same shared state. If the underlying catalog + /// implementation is Hive, this will initialize the metastore, which may take some time. + /// + public SparkSession NewSession() + { + return new SparkSession(sparkSessionProxy.NewSession()); + } + + /// + /// Stop underlying SparkContext + /// + public void Stop() + { + sparkSessionProxy.Stop(); + } + + /// + /// Returns a DataFrameReader that can be used to read non-streaming data in as a DataFrame + /// + /// + public DataFrameReader Read() + { + logger.LogInfo("Using DataFrameReader to read input data from external data source"); + return new DataFrameReader(sparkSessionProxy.Read(), sparkContext); + } + + /// + /// Creates a from a RDD containing array of object using the given schema. + /// + /// RDD containing array of object. The array acts as a row and items within the array act as columns which the schema is specified in . + /// The schema of DataFrame. + /// + public DataFrame CreateDataFrame(RDD rdd, StructType schema) + { + // Note: This is for pickling RDD, convert to RDD which happens in CSharpWorker. + // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. + // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. + // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. + var rddRow = rdd.Map(r => r); + rddRow.serializedMode = SerializedMode.Row; + + return new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext); + } + + /// + /// Returns the specified table as a + /// + /// + /// + public DataFrame Table(string tableName) + { + return new DataFrame(sparkSessionProxy.Table(tableName), sparkContext); + } + + /// + /// Executes a SQL query using Spark, returning the result as a DataFrame. 
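// Illustration only, not part of this patch: a short end-to-end sketch of the SparkSession
// members above. Assumes a session obtained from the Builder and a table named "people";
// both are hypothetical.
DataFrame people = session.Table("people");
DataFrame adults = session.Sql("SELECT name, age FROM people WHERE age >= 18");
adults.Show();
SparkSession isolated = session.NewSession();   // shares the SparkContext, isolates SQL conf
isolated.Sql("SET spark.sql.shuffle.partitions=4");
session.Stop();                                 // stops the underlying SparkContext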
The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect' + /// + /// + /// + public DataFrame Sql(string sqlQuery) + { + logger.LogInfo("SQL query to execute on the dataframe is {0}", sqlQuery); + return new DataFrame(sparkSessionProxy.Sql(sqlQuery), sparkContext); + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs index 829b14e6..4f1bf7aa 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs @@ -23,17 +23,38 @@ public class SqlContext private static SqlContext instance; + private SparkSession sparkSession; + private bool isRootContext; + + /// + /// Underlying SparkSession + /// + public SparkSession SparkSession + { + get { return sparkSession; } + } + + internal SqlContext(SparkSession sparkSession, bool isRootContext) + { + this.sparkSession = sparkSession; + this.isRootContext = isRootContext; + if (instance == null) instance = this; + } + + internal SqlContext(SparkSession sparkSession) : this(sparkSession, true) + { } + /// /// Creates a SqlContext /// /// - public SqlContext(SparkContext sparkContext) + public SqlContext(SparkContext sparkContext) : this(new SparkSession(sparkContext)) { + sqlContextProxy = sparkSession.SparkSessionProxy.SqlContextProxy; this.sparkContext = sparkContext; - sqlContextProxy = sparkContext.SparkContextProxy.CreateSqlContext(); - if (instance == null) instance = this; } + //TODO - remove this constructor after fixing unit tests that reference this internal SqlContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy) { this.sparkContext = sparkContext; @@ -62,8 +83,7 @@ public static SqlContext GetOrCreate(SparkContext sparkContext) /// public SqlContext NewSession() { - var newSessionProxy = sqlContextProxy.NewSession(); - return new SqlContext(this.sparkContext, newSessionProxy); + return new SqlContext(sparkSession.NewSession()); } /// @@ -75,7 +95,7 @@ public SqlContext NewSession() /// public string GetConf(string key, string defaultValue) { - return sqlContextProxy.GetConf(key, defaultValue); + return SparkSession.SparkSessionProxy.SqlContextProxy.GetConf(key, defaultValue); } /// @@ -85,7 +105,7 @@ public string GetConf(string key, string defaultValue) /// public void SetConf(string key, string value) { - sqlContextProxy.SetConf(key, value); + SparkSession.SparkSessionProxy.SqlContextProxy.SetConf(key, value); } /// @@ -155,7 +175,7 @@ public void DropTempTable(string tableName) /// public DataFrame Table(string tableName) { - return new DataFrame(sqlContextProxy.Table(tableName), sparkContext); + return SparkSession.Table(tableName); } /// @@ -230,7 +250,7 @@ public bool IsCached(string tableName) public DataFrame Sql(string sqlQuery) { logger.LogInfo("SQL query to execute on the dataframe is {0}", sqlQuery); - return new DataFrame(sqlContextProxy.Sql(sqlQuery), sparkContext); + return SparkSession.Sql(sqlQuery); } /// diff --git a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML index 5a445d12..0304bdcc 100644 --- a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML +++ b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML @@ -1992,6 +1992,11 @@ Key to use Default value to use + + + Get all parameters as a list of pairs + + Main entry point for Spark functionality. 
A SparkContext represents the @@ -2072,6 +2077,17 @@ + + + This function may be used to get or instantiate a SparkContext and register it as a + singleton object. Because we can only have one active SparkContext per JVM, + this is useful when applications may wish to share a SparkContext. + Note: This function cannot be used to create multiple SparkContext instances + even if multiple contexts are allowed. + + + + Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings. @@ -4577,6 +4593,312 @@ + + + The entry point to programming Spark with the Dataset and DataFrame API. + + + + + Sets the Spark master URL to connect to, such as "local" to run locally, "local[4]" to + run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster. + + Master URL + + + + Sets a name for the application, which will be shown in the Spark web UI. + If no application name is set, a randomly generated name will be used. + + Name of the app + + + + Sets a config option. Options set using this method are automatically propagated to + both SparkConf and SparkSession's own configuration. + + Key for the configuration + value of the configuration + + + + Sets a config option. Options set using this method are automatically propagated to + both SparkConf and SparkSession's own configuration. + + Key for the configuration + value of the configuration + + + + Sets a config option. Options set using this method are automatically propagated to + both SparkConf and SparkSession's own configuration. + + Key for the configuration + value of the configuration + + + + Sets a config option. Options set using this method are automatically propagated to + both SparkConf and SparkSession's own configuration. + + Key for the configuration + value of the configuration + + + + Sets a list of config options based on the given SparkConf + + + + + Enables Hive support, including connectivity to a persistent Hive metastore, support for + Hive serdes, and Hive user-defined functions. + + + + + Gets an existing [[SparkSession]] or, if there is no existing one, creates a new + one based on the options set in this builder. + + + + + + Catalog interface for Spark. + + + + + Returns the current default database in this session. + + + + + Returns a list of databases available across all sessions. + + + + + + Returns a list of tables in the current database or given database + This includes all temporary tables. + + Optional database name. If not provided, current database is used + + + + Returns a list of columns for the given table in the current database or + the given temporary table. + + Name of the table + Name of the database. If database is not provided, current database is used + + + + Returns a list of functions registered in the specified database. + This includes all temporary functions + + Name of the database. If database is not provided, current database is used + + + + Sets the current default database in this session. + + Name of database + + + + Drops the temporary view with the given view name in the catalog. + If the view has been cached before, then it will also be uncached. + + Name of the table + + + + Returns true if the table is currently cached in-memory. + + Name of the table + + + + Caches the specified table in-memory. + + Name of the table + + + + Removes the specified table from the in-memory cache. + + Name of the table + + + + Invalidate and refresh all the cached metadata of the given table. 
For performance reasons, + Spark SQL or the external data source library it uses might cache certain metadata about a + table, such as the location of blocks.When those change outside of Spark SQL, users should + call this function to invalidate the cache. + If this table is cached as an InMemoryRelation, drop the original cached version and make the + new version cached lazily. + + Name of the table + + + + Removes all cached tables from the in-memory cache. + + + + + Creates an external table from the given path and returns the corresponding DataFrame. + It will use the default data source configured by spark.sql.sources.default. + + Name of the table + Path to table + + + + Creates an external table from the given path on a data source and returns DataFrame + + Name of the table + Path to table + Data source + + + + Creates an external table from the given path based on a data source and a set of options. + Then, returns the corresponding DataFrame. + + Name of the table + Data source + Options to create table + + + + + Create an external table from the given path based on a data source, a schema and + a set of options.Then, returns the corresponding DataFrame. + + Name of the table + Data source + Schema of the table + Options to create table + + + + + A database in Spark + + + + + Name of the database + + + + + Desciption for the database + + + + + Location of the database + + + + + A table in Spark + + + + + Name of the table + + + + + Name of the database Table belongs to + + + + + Description of the table + + + + + Type of the table (table, view) + + + + + Whether the table is a temporary table + + + + + A column in Spark + + + + + Name of the column + + + + + Datatype of the column + + + + + Description of the column + + + + + Whether the column value can be null + + + + + Whether the column is a partition column. + + + + + Whether the column is a bucket column. + + + + + A user-defined function in Spark + + + + + Name of the column + + + + + Name of the database + + + + + Description of the function + + + + + Fully qualified class name of the function + + + + + Whether the function is a temporary function or not. + + A column that will be computed based on the data in a DataFrame. @@ -5785,6 +6107,73 @@ Format("parquet").Save(path) + + + Dataset is a strongly typed collection of domain-specific objects that can be transformed + in parallel using functional or relational operations.Each Dataset also has an untyped view + called a DataFrame, which is a Dataset of Row. + + + + + Selects column based on the column name + + Name of the column + + + + + Converts this strongly typed collection of data to generic Dataframe. In contrast to the + strongly typed objects that Dataset operations work on, a Dataframe returns generic[[Row]] + objects that allow fields to be accessed by ordinal or name. + + DataFrame created from Dataset + + + + Prints the schema to the console in a nice tree format. + + + + + Prints the plans (logical and physical) to the console for debugging purposes. + + + + + + Prints the physical plan to the console for debugging purposes. + + + + + Returns all column names and their data types as an array. + + + + + Returns all column names as an array. + + + + + Displays the top 20 rows of Dataset in a tabular form. Strings more than 20 characters + will be truncated, and all cells will be aligned right. 
+ + Number of rows - default is 20 + Indicates if rows with more than 20 characters to be truncated + + + + Prints schema + + + + + Dataset of specific types + + Type parameter + A variant of Spark SQL that integrates with data stored in Hive. @@ -6594,12 +6983,76 @@ The given SaveMode The string that represents the given SaveMode + + + The entry point to programming Spark with the Dataset and DataFrame API. + + + + + Interface through which the user may create, drop, alter or query underlying + databases, tables, functions etc. + + + + + Builder for SparkSession + + + + + Start a new session with isolated SQL configurations, temporary tables, registered + functions are isolated, but sharing the underlying [[SparkContext]] and cached data. + Note: Other than the [[SparkContext]], all shared state is initialized lazily. + This method will force the initialization of the shared state to ensure that parent + and child sessions are set up with the same shared state. If the underlying catalog + implementation is Hive, this will initialize the metastore, which may take some time. + + + + + Stop underlying SparkContext + + + + + Returns a DataFrameReader that can be used to read non-streaming data in as a DataFrame + + + + + + Creates a from a RDD containing array of object using the given schema. + + RDD containing array of object. The array acts as a row and items within the array act as columns which the schema is specified in . + The schema of DataFrame. + + + + + Returns the specified table as a + + + + + + + Executes a SQL query using Spark, returning the result as a DataFrame. The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect' + + + + The entry point for working with structured data (rows and columns) in Spark. Allows the creation of [[DataFrame]] objects as well as the execution of SQL queries. + + + Underlying SparkSession + + Creates a SqlContext diff --git a/csharp/Adapter/documentation/Mobius_API_Documentation.md b/csharp/Adapter/documentation/Mobius_API_Documentation.md index 87b56ec2..6ad46d6e 100644 --- a/csharp/Adapter/documentation/Mobius_API_Documentation.md +++ b/csharp/Adapter/documentation/Mobius_API_Documentation.md @@ -311,7 +311,7 @@ ####Methods -
<table><tr><th>Name</th><th>Description</th></tr><tr><td>SetMaster</td><td>The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.</td></tr><tr><td>SetAppName</td><td>Set a name for your application. Shown in the Spark web UI.</td></tr><tr><td>SetSparkHome</td><td>Set the location where Spark is installed on worker nodes.</td></tr><tr><td>Set</td><td>Set the value of a string config</td></tr><tr><td>GetInt</td><td>Get an int parameter value, falling back to a default if not set</td></tr><tr><td>Get</td><td>Get a string parameter value, falling back to a default if not set</td></tr></table>
+
<table><tr><th>Name</th><th>Description</th></tr><tr><td>SetMaster</td><td>The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.</td></tr><tr><td>SetAppName</td><td>Set a name for your application. Shown in the Spark web UI.</td></tr><tr><td>SetSparkHome</td><td>Set the location where Spark is installed on worker nodes.</td></tr><tr><td>Set</td><td>Set the value of a string config</td></tr><tr><td>GetInt</td><td>Get an int parameter value, falling back to a default if not set</td></tr><tr><td>Get</td><td>Get a string parameter value, falling back to a default if not set</td></tr><tr><td>GetAll</td><td>Get all parameters as a list of pairs</td></tr></table>
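A rough illustration (not part of this patch) of the newly documented GetAll, which also backs the new Builder.Config(SparkConf) overload added in this change; the configuration values below are hypothetical:

    var conf = new SparkConf();
    conf.SetAppName("MobiusConfSample");
    conf.Set("spark.local.dir", @"C:\temp\spark");
    foreach (var entry in conf.GetAll())    // one key/value pair per configured setting
    {
        Console.WriteLine("{0}={1}", entry.Key, entry.Value);
    }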
--- @@ -327,7 +327,7 @@ ####Methods -
NameDescription
GetActiveSparkContextGet existing SparkContext
GetConfReturn a copy of this JavaSparkContext's configuration. The configuration ''cannot'' be changed at runtime.
TextFileRead a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.
Parallelize``1Distribute a local collection to form an RDD. sc.Parallelize(new int[] {0, 2, 3, 4, 6}, 5).Glom().Collect() [[0], [2], [3], [4], [6]]
EmptyRDDCreate an RDD that has no partitions or elements.
WholeTextFilesRead a directory of text files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file, the value is the content of each file. For example, if you have the following files: {{{ hdfs://a-hdfs-path/part-00000 hdfs://a-hdfs-path/part-00001 ... hdfs://a-hdfs-path/part-nnnnn }}} Do {{{ RDD<KeyValuePair<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path") }}} then `rdd` contains {{{ (a-hdfs-path/part-00000, its content) (a-hdfs-path/part-00001, its content) ... (a-hdfs-path/part-nnnnn, its content) }}} Small files are preferred, large file is also allowable, but may cause bad performance. minPartitions A suggestion value of the minimal splitting number for input data.
BinaryFilesRead a directory of binary files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI as a byte array. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file, the value is the content of each file. For example, if you have the following files: {{{ hdfs://a-hdfs-path/part-00000 hdfs://a-hdfs-path/part-00001 ... hdfs://a-hdfs-path/part-nnnnn }}} Do RDD<KeyValuePair<string, byte[]>>"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, then `rdd` contains {{{ (a-hdfs-path/part-00000, its content) (a-hdfs-path/part-00001, its content) ... (a-hdfs-path/part-nnnnn, its content) }}} @note Small files are preferred; very large files but may cause bad performance. @param minPartitions A suggestion value of the minimal splitting number for input data.
SequenceFileRead a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. The mechanism is as follows: 1. A Java RDD is created from the SequenceFile or other InputFormat, and the key and value Writable classes 2. Serialization is attempted via Pyrolite pickling 3. If this fails, the fallback is to call 'toString' on each key and value 4. PickleSerializer is used to deserialize pickled objects on the Python side
NewAPIHadoopFileRead a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. The mechanism is the same as for sc.sequenceFile. A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
NewAPIHadoopRDDRead a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration, which is passed in as a Python dict. This will be converted into a Configuration in Java. The mechanism is the same as for sc.sequenceFile.
HadoopFileRead an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. The mechanism is the same as for sc.sequenceFile. A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java.
HadoopRDDRead an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration, which is passed in as a Python dict. This will be converted into a Configuration in Java. The mechanism is the same as for sc.sequenceFile.
Union``1Build the union of a list of RDDs. This supports unions() of RDDs with different serialized formats, although this forces them to be reserialized using the default serializer: >>> path = os.path.join(tempdir, "union-text.txt") >>> with open(path, "w") as testFile: ... _ = testFile.write("Hello") >>> textFile = sc.textFile(path) >>> textFile.collect() [u'Hello'] >>> parallelized = sc.parallelize(["World!"]) >>> sorted(sc.union([textFile, parallelized]).collect()) [u'Hello', 'World!']
Broadcast``1Broadcast a read-only variable to the cluster, returning a Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.
Accumulator``1Create an with the given initial value, using a given helper object to define how to add values of the data type if provided. Default AccumulatorParams are used for integers and floating-point numbers if you do not provide one. For other types, a custom AccumulatorParam can be used.
StopShut down the SparkContext.
AddFileAdd a file to be downloaded with this Spark job on every node. The `path` passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs, use `SparkFiles.get(fileName)` to find its download location.
SetCheckpointDirSet the directory under which RDDs are going to be checkpointed. The directory must be a HDFS path if running on a cluster.
SetJobGroupAssigns a group ID to all the jobs started by this thread until the group ID is set to a different value or cleared. Often, a unit of execution in an application consists of multiple Spark actions or jobs. Application programmers can use this method to group all those jobs together and give a group description. Once set, the Spark web UI will associate such jobs with this group. The application can also use [[org.apache.spark.api.java.JavaSparkContext.cancelJobGroup]] to cancel all running jobs in this group. For example, {{{ // In the main thread: sc.setJobGroup("some_job_to_cancel", "some job description"); rdd.map(...).count(); // In a separate thread: sc.cancelJobGroup("some_job_to_cancel"); }}} If interruptOnCancel is set to true for the job group, then job cancellation will result in Thread.interrupt() being called on the job's executor threads. This is useful to help ensure that the tasks are actually stopped in a timely manner, but is off by default due to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead.
SetLocalPropertySet a local property that affects jobs submitted from this thread, such as the Spark fair scheduler pool.
GetLocalPropertyGet a local property set in this thread, or null if it is missing. See [[org.apache.spark.api.java.JavaSparkContext.setLocalProperty]].
SetLogLevelControl our logLevel. This overrides any user-defined log settings. @param logLevel The desired log level as a string. Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
CancelJobGroupCancel active jobs for the specified group. See for more information.
CancelAllJobsCancel all jobs that have been scheduled or are running.
+
|Name|Description|
|----|-----------|
|GetActiveSparkContext|Get the existing SparkContext.|
|GetConf|Return a copy of this JavaSparkContext's configuration. The configuration ''cannot'' be changed at runtime.|
|GetOrCreate|This function may be used to get or instantiate a SparkContext and register it as a singleton object. Because we can only have one active SparkContext per JVM, this is useful when applications may wish to share a SparkContext. Note: This function cannot be used to create multiple SparkContext instances even if multiple contexts are allowed.|
|TextFile|Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.|
|Parallelize``1|Distribute a local collection to form an RDD. For example, sc.Parallelize(new int[] {0, 2, 3, 4, 6}, 5).Glom().Collect() returns [[0], [2], [3], [4], [6]].|
|EmptyRDD|Create an RDD that has no partitions or elements.|
|WholeTextFiles|Read a directory of text files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file and the value is the content of each file. For example, if you have the following files: {{{ hdfs://a-hdfs-path/part-00000 hdfs://a-hdfs-path/part-00001 ... hdfs://a-hdfs-path/part-nnnnn }}} Do {{{ RDD<KeyValuePair<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path") }}} then `rdd` contains {{{ (a-hdfs-path/part-00000, its content) (a-hdfs-path/part-00001, its content) ... (a-hdfs-path/part-nnnnn, its content) }}} Small files are preferred; large files are also allowed but may cause bad performance. minPartitions is a suggested minimum number of partitions for the input data.|
|BinaryFiles|Read a directory of binary files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI as a byte array. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file and the value is the content of each file. For example, if you have the following files: {{{ hdfs://a-hdfs-path/part-00000 hdfs://a-hdfs-path/part-00001 ... hdfs://a-hdfs-path/part-nnnnn }}} Do {{{ RDD<KeyValuePair<string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") }}} then `rdd` contains {{{ (a-hdfs-path/part-00000, its content) (a-hdfs-path/part-00001, its content) ... (a-hdfs-path/part-nnnnn, its content) }}} Small files are preferred; very large files may cause bad performance. minPartitions is a suggested minimum number of partitions for the input data.|
|SequenceFile|Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. The mechanism is as follows: 1. A Java RDD is created from the SequenceFile or other InputFormat, and the key and value Writable classes. 2. Serialization is attempted via Pyrolite pickling. 3. If this fails, the fallback is to call 'toString' on each key and value. 4. PickleSerializer is used to deserialize pickled objects on the Python side.|
|NewAPIHadoopFile|Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. The mechanism is the same as for sc.sequenceFile. A Hadoop configuration can be passed in as a Python dict, which will be converted into a Configuration in Java.|
|NewAPIHadoopRDD|Read a 'new API' Hadoop InputFormat with arbitrary key and value class from an arbitrary Hadoop configuration, which is passed in as a Python dict and converted into a Configuration in Java. The mechanism is the same as for sc.sequenceFile.|
|HadoopFile|Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. The mechanism is the same as for sc.sequenceFile. A Hadoop configuration can be passed in as a Python dict, which will be converted into a Configuration in Java.|
|HadoopRDD|Read an 'old' Hadoop InputFormat with arbitrary key and value class from an arbitrary Hadoop configuration, which is passed in as a Python dict and converted into a Configuration in Java. The mechanism is the same as for sc.sequenceFile.|
|Union``1|Build the union of a list of RDDs. This supports unions() of RDDs with different serialized formats, although this forces them to be reserialized using the default serializer: >>> path = os.path.join(tempdir, "union-text.txt") >>> with open(path, "w") as testFile: ... _ = testFile.write("Hello") >>> textFile = sc.textFile(path) >>> textFile.collect() [u'Hello'] >>> parallelized = sc.parallelize(["World!"]) >>> sorted(sc.union([textFile, parallelized]).collect()) [u'Hello', 'World!']|
|Broadcast``1|Broadcast a read-only variable to the cluster, returning a Broadcast object for reading it in distributed functions. The variable will be sent to each cluster only once.|
|Accumulator``1|Create an Accumulator with the given initial value, using a given helper object to define how to add values of the data type if provided. Default AccumulatorParams are used for integers and floating-point numbers if you do not provide one. For other types, a custom AccumulatorParam can be used.|
|Stop|Shut down the SparkContext.|
|AddFile|Add a file to be downloaded with this Spark job on every node. The `path` passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs, use `SparkFiles.get(fileName)` to find its download location.|
|SetCheckpointDir|Set the directory under which RDDs are going to be checkpointed. The directory must be an HDFS path if running on a cluster.|
|SetJobGroup|Assigns a group ID to all the jobs started by this thread until the group ID is set to a different value or cleared. Often, a unit of execution in an application consists of multiple Spark actions or jobs. Application programmers can use this method to group all those jobs together and give a group description. Once set, the Spark web UI will associate such jobs with this group. The application can also use [[org.apache.spark.api.java.JavaSparkContext.cancelJobGroup]] to cancel all running jobs in this group. For example: {{{ // In the main thread: sc.setJobGroup("some_job_to_cancel", "some job description"); rdd.map(...).count(); // In a separate thread: sc.cancelJobGroup("some_job_to_cancel"); }}} If interruptOnCancel is set to true for the job group, then job cancellation will result in Thread.interrupt() being called on the job's executor threads. This is useful to help ensure that the tasks are actually stopped in a timely manner, but is off by default due to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead.|
|SetLocalProperty|Set a local property that affects jobs submitted from this thread, such as the Spark fair scheduler pool.|
|GetLocalProperty|Get a local property set in this thread, or null if it is missing. See [[org.apache.spark.api.java.JavaSparkContext.setLocalProperty]].|
|SetLogLevel|Control the log level. This overrides any user-defined log settings. @param logLevel The desired log level as a string. Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN.|
|CancelJobGroup|Cancel active jobs for the specified group. See SetJobGroup for more information.|
|CancelAllJobs|Cancel all jobs that have been scheduled or are running.|
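The SparkContext surface above mirrors the Scala and PySpark contexts. As an illustrative sketch only (not part of this patch), the calls could be combined in a Mobius driver roughly as follows; the application name, input path and broadcast value are placeholders, and exact overload signatures may differ from this sketch.

```csharp
// Illustrative sketch, not from this patch. Assumes a Mobius driver project
// referencing Microsoft.SparkCLR; paths, names and values are placeholders.
using System;
using Microsoft.Spark.CSharp.Core;

class SparkContextSketch
{
    static void Main(string[] args)
    {
        var conf = new SparkConf();
        conf.SetAppName("MobiusContextSketch");                       // placeholder app name
        var sc = new SparkContext(conf);
        sc.SetLogLevel("WARN");

        // TextFile + Parallelize + Union, as described in the table above
        var fileLines = sc.TextFile(@"hdfs://some-path/input.txt");   // placeholder path
        var extraLines = sc.Parallelize(new[] { "World!" }, 1);
        var allLines = sc.Union(new[] { fileLines, extraLines });

        // Broadcast a read-only value; Accumulator with an integer initial value
        var greeting = sc.Broadcast("Hello");                         // placeholder value
        var counter = sc.Accumulator(0);

        Console.WriteLine("Union count: {0}", allLines.Count());
        sc.Stop();
    }
}
```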
---

@@ -550,6 +550,62 @@

---

+###Microsoft.Spark.CSharp.Sql.Builder
+####Summary
+
+        The entry point to programming Spark with the Dataset and DataFrame API.
+
+####Methods
+
|Name|Description|
|----|-----------|
|Master|Sets the Spark master URL to connect to, such as "local" to run locally, "local[4]" to run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.|
|AppName|Sets a name for the application, which will be shown in the Spark web UI. If no application name is set, a randomly generated name will be used.|
|Config|Sets a config option. Options set using this method are automatically propagated to both SparkConf and SparkSession's own configuration.|
|Config|Sets a config option. Options set using this method are automatically propagated to both SparkConf and SparkSession's own configuration.|
|Config|Sets a config option. Options set using this method are automatically propagated to both SparkConf and SparkSession's own configuration.|
|Config|Sets a config option. Options set using this method are automatically propagated to both SparkConf and SparkSession's own configuration.|
|Config|Sets a list of config options based on the given SparkConf.|
|EnableHiveSupport|Enables Hive support, including connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined functions.|
|GetOrCreate|Gets an existing [[SparkSession]] or, if there is no existing one, creates a new one based on the options set in this builder.|
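For orientation, here is a minimal, illustrative C# sketch of the Builder methods listed above. It is not taken from this patch; it assumes the builder methods chain fluently (as in Spark's own builders), and the master URL, application name and config key are placeholders.

```csharp
// Illustrative sketch, not from this patch; master URL, app name and
// config key/value are placeholders.
using Microsoft.Spark.CSharp.Sql;

class BuilderSketch
{
    static void Main(string[] args)
    {
        SparkSession session = SparkSession.Builder()
            .Master("local[4]")                    // placeholder master URL
            .AppName("MobiusBuilderSketch")        // placeholder application name
            .Config("spark.some.option", "value")  // placeholder config option
            .EnableHiveSupport()                   // optional Hive metastore support
            .GetOrCreate();

        // ... use the session ...
        session.Stop();
    }
}
```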
+
+---
+
+###Microsoft.Spark.CSharp.Sql.Catalog.Catalog
+####Summary
+
+        Catalog interface for Spark.
+
+####Methods
+
|Name|Description|
|----|-----------|
|ListDatabases|Returns a list of databases available across all sessions.|
|ListTables|Returns a list of tables in the current database or the given database. This includes all temporary tables.|
|ListColumns|Returns a list of columns for the given table in the current database or for the given temporary table.|
|ListFunctions|Returns a list of functions registered in the specified database. This includes all temporary functions.|
|SetCurrentDatabase|Sets the current default database in this session.|
|DropTempView|Drops the temporary view with the given view name in the catalog. If the view has been cached before, then it will also be uncached.|
|IsCached|Returns true if the table is currently cached in memory.|
|CacheTable|Caches the specified table in memory.|
|UnCacheTable|Removes the specified table from the in-memory cache.|
|RefreshTable|Invalidates and refreshes all the cached metadata of the given table. For performance reasons, Spark SQL or the external data source library it uses might cache certain metadata about a table, such as the location of blocks. When those change outside of Spark SQL, users should call this function to invalidate the cache. If this table is cached as an InMemoryRelation, the original cached version is dropped and the new version is cached lazily.|
|ClearCache|Removes all cached tables from the in-memory cache.|
|CreateExternalTable|Creates an external table from the given path and returns the corresponding DataFrame. It will use the default data source configured by spark.sql.sources.default.|
|CreateExternalTable|Creates an external table from the given path based on a data source and returns the corresponding DataFrame.|
|CreateExternalTable|Creates an external table from the given path based on a data source and a set of options, then returns the corresponding DataFrame.|
|CreateExternalTable|Creates an external table from the given path based on a data source, a schema and a set of options, then returns the corresponding DataFrame.|
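A short, illustrative sketch of the Catalog calls documented above (not part of this patch). The table name is a placeholder, and it assumes ListDatabases/ListTables return Datasets that expose Show(), as the tests added in this patch suggest.

```csharp
// Illustrative sketch, not from this patch; "some_table" is a placeholder.
using System;
using Microsoft.Spark.CSharp.Sql;

class CatalogSketch
{
    static void Main(string[] args)
    {
        var session = SparkSession.Builder().GetOrCreate();
        var catalog = session.Catalog;

        Console.WriteLine("Current database: {0}", catalog.CurrentDatabase);

        // Listing calls return Datasets that can be displayed or collected
        catalog.ListDatabases().Show();
        catalog.ListTables().Show();

        // Cache management for a (placeholder) table
        catalog.CacheTable("some_table");
        Console.WriteLine("Cached: {0}", catalog.IsCached("some_table"));
        catalog.UnCacheTable("some_table");
        catalog.ClearCache();
    }
}
```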
+
+---
+
+###Microsoft.Spark.CSharp.Sql.Catalog.Database
+####Summary
+
+        A database in Spark
+
+###Microsoft.Spark.CSharp.Sql.Catalog.Table
+####Summary
+
+        A table in Spark
+
+###Microsoft.Spark.CSharp.Sql.Catalog.Column
+####Summary
+
+        A column in Spark
+
+###Microsoft.Spark.CSharp.Sql.Catalog.Function
+####Summary
+
+        A user-defined function in Spark
+
###Microsoft.Spark.CSharp.Sql.Column
####Summary

@@ -647,6 +703,30 @@

---

+###Microsoft.Spark.CSharp.Sql.Dataset
+####Summary
+
+        Dataset is a strongly typed collection of domain-specific objects that can be transformed
+        in parallel using functional or relational operations. Each Dataset also has an untyped view
+        called a DataFrame, which is a Dataset of Row.
+
+####Methods
+
|Name|Description|
|----|-----------|
|ToDF|Converts this strongly typed collection of data to a generic DataFrame. In contrast to the strongly typed objects that Dataset operations work on, a DataFrame returns generic [[Row]] objects that allow fields to be accessed by ordinal or name.|
|PrintSchema|Prints the schema to the console in a nice tree format.|
|Explain|Prints the plans (logical and physical) to the console for debugging purposes.|
|Explain|Prints the physical plan to the console for debugging purposes.|
|DTypes|Returns all column names and their data types as an array.|
|Columns|Returns all column names as an array.|
|Show|Displays the top 20 rows of the Dataset in tabular form. Strings longer than 20 characters will be truncated, and all cells will be aligned right.|
|ShowSchema|Prints the schema.|
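To see these inspection methods together, here is an illustrative sketch (not from this patch). It borrows the catalog's ListDatabases call, which the tests in this patch treat as returning a Dataset, purely to have a Dataset instance to inspect.

```csharp
// Illustrative sketch, not from this patch; uses a catalog listing only to
// obtain a Dataset instance to inspect.
using System;
using System.Linq;
using Microsoft.Spark.CSharp.Sql;

class DatasetSketch
{
    static void Main(string[] args)
    {
        var session = SparkSession.Builder().GetOrCreate();
        var databases = session.Catalog.ListDatabases();   // a Dataset, per the Catalog section above

        databases.PrintSchema();     // schema as a tree
        databases.Explain(true);     // logical and physical plans
        databases.Show();            // first 20 rows, right-aligned, truncated to 20 chars

        Console.WriteLine("Columns: {0}", string.Join(", ", databases.Columns()));
        foreach (var dtype in databases.DTypes())
        {
            // DTypes yields (column name, simple type string) pairs
            Console.WriteLine("{0}: {1}", dtype.Item1, dtype.Item2);
        }

        var df = databases.ToDF();   // untyped DataFrame view of the same data
    }
}
```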
+
+---
+
+###Microsoft.Spark.CSharp.Sql.Dataset`1
+####Summary
+
+        Dataset of specific types
+
+        Type parameter
+
###Microsoft.Spark.CSharp.Sql.HiveContext
####Summary

@@ -747,6 +827,20 @@

---

+###Microsoft.Spark.CSharp.Sql.SparkSession
+####Summary
+
+        The entry point to programming Spark with the Dataset and DataFrame API.
+
+####Methods
+
|Name|Description|
|----|-----------|
|Builder|Builder for SparkSession.|
|NewSession|Starts a new session in which SQL configurations, temporary tables and registered functions are isolated, but the underlying [[SparkContext]] and cached data are shared. Note: Other than the [[SparkContext]], all shared state is initialized lazily. This method will force the initialization of the shared state to ensure that parent and child sessions are set up with the same shared state. If the underlying catalog implementation is Hive, this will initialize the metastore, which may take some time.|
|Stop|Stops the underlying SparkContext.|
|Read|Returns a DataFrameReader that can be used to read non-streaming data in as a DataFrame.|
|CreateDataFrame|Creates a DataFrame from an RDD containing arrays of objects, using the given schema.|
|Table|Returns the specified table as a DataFrame.|
|Sql|Executes a SQL query using Spark, returning the result as a DataFrame. The dialect used for SQL parsing can be configured with 'spark.sql.dialect'.|
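Putting the SparkSession pieces together, here is an illustrative end-to-end sketch (not from this patch). The JSON input path and temp-table name are placeholders; the Read().Json(...) and RegisterTempTable calls follow the samples added elsewhere in this patch.

```csharp
// Illustrative sketch, not from this patch; input path and view name are placeholders.
using System;
using Microsoft.Spark.CSharp.Sql;

class SparkSessionSketch
{
    static void Main(string[] args)
    {
        var session = SparkSession.Builder().AppName("MobiusSessionSketch").GetOrCreate();

        // Read returns a DataFrameReader; Json materializes a DataFrame
        var people = session.Read().Json(@"people.json");    // placeholder input path
        people.RegisterTempTable("people");                   // placeholder temp table name

        // Sql and Table both return DataFrames
        var bills = session.Sql("SELECT name, address.state FROM people WHERE name = 'Bill'");
        var peopleTable = session.Table("people");
        Console.WriteLine("Matching rows: {0}", bills.Count());

        // NewSession isolates SQL conf and temp tables but shares the SparkContext
        var isolated = session.NewSession();

        session.Stop();
    }
}
```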
+ +--- + + ###Microsoft.Spark.CSharp.Sql.SqlContext ####Summary diff --git a/csharp/AdapterTest/AdapterTest.csproj b/csharp/AdapterTest/AdapterTest.csproj index dfe00ea8..ca95b87c 100644 --- a/csharp/AdapterTest/AdapterTest.csproj +++ b/csharp/AdapterTest/AdapterTest.csproj @@ -65,10 +65,12 @@ + + @@ -76,6 +78,7 @@ + @@ -83,6 +86,7 @@ + @@ -91,6 +95,7 @@ + diff --git a/csharp/AdapterTest/BuilderTest.cs b/csharp/AdapterTest/BuilderTest.cs new file mode 100644 index 00000000..aae3c626 --- /dev/null +++ b/csharp/AdapterTest/BuilderTest.cs @@ -0,0 +1,50 @@ +using System; +using Microsoft.Spark.CSharp.Sql; +using NUnit.Framework; + +namespace AdapterTest +{ + [TestFixture] + public class BuilderTest + { + [Test] + public void TestMaster() + { + var builder = new Builder(); + builder.Master("test"); + Assert.AreEqual("test", builder.options["spark.master"]); + } + + [Test] + public void TestAppName() + { + var builder = new Builder(); + builder.AppName("test"); + Assert.AreEqual("test", builder.options["spark.app.name"]); + } + + [Test] + public void TestBoolConfig() + { + var builder = new Builder(); + builder.Config("boolvalue", true); + Assert.True(builder.options["boolvalue"].Equals("true", StringComparison.InvariantCultureIgnoreCase)); + } + + [Test] + public void TestLongConfig() + { + var builder = new Builder(); + builder.Config("longvalue", 3L); + Assert.True(builder.options["longvalue"].Equals("3", StringComparison.InvariantCultureIgnoreCase)); + } + + [Test] + public void TestDoubleConfig() + { + var builder = new Builder(); + builder.Config("doublevalue", 3.5D); + Assert.True(builder.options["doublevalue"].Equals("3.5", StringComparison.InvariantCultureIgnoreCase)); + } + } +} diff --git a/csharp/AdapterTest/CatalogTest.cs b/csharp/AdapterTest/CatalogTest.cs new file mode 100644 index 00000000..e1fbdf05 --- /dev/null +++ b/csharp/AdapterTest/CatalogTest.cs @@ -0,0 +1,212 @@ +using System; +using System.Collections.Generic; +using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Sql; +using Microsoft.Spark.CSharp.Sql.Catalog; +using Moq; +using NUnit.Framework; +using NUnit.Framework.Internal; +using Column = Microsoft.Spark.CSharp.Sql.Catalog.Column; + +namespace AdapterTest +{ + [TestFixture] + public class CatalogTest + { + [Test] + public void TestCurrentCatalog() + { + var mockCatalogProxy = new Mock(); + mockCatalogProxy.Setup(m => m.CurrentDatabase).Returns("currentdb"); + + var catalog = new Catalog(mockCatalogProxy.Object); + Assert.AreEqual("currentdb", catalog.CurrentDatabase); + } + + [Test] + public void TestGetDatabasesList() + { + var mockCatalogProxy = new Mock(); + var mockDatasetProxy = new Mock(); + var mockDataFrameProxy = new Mock(); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + mockCatalogProxy.Setup(m => m.ListDatabases()).Returns(new Dataset(mockDatasetProxy.Object)); + + var catalog = new Catalog(mockCatalogProxy.Object); + var databases = catalog.ListDatabases(); + Assert.AreSame(mockDataFrameProxy.Object, databases.DataFrameProxy); + } + + [Test] + public void TestGetTablesList() + { + var mockCatalogProxy = new Mock(); + var mockDatasetProxy = new Mock(); + var mockDataFrameProxy = new Mock(); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + mockCatalogProxy.Setup(m => m.ListTables(It.IsAny())).Returns(new Dataset(mockDatasetProxy.Object)); + + var catalog = new Catalog(mockCatalogProxy.Object); + var tables = catalog.ListTables(); + Assert.AreSame(mockDataFrameProxy.Object, 
tables.DataFrameProxy); + } + + [Test] + public void TestGetColumnsList() + { + var mockCatalogProxy = new Mock(); + var mockDatasetProxy = new Mock(); + var mockDataFrameProxy = new Mock(); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + mockCatalogProxy.Setup(m => m.ListColumns(It.IsAny(), It.IsAny())).Returns(new Dataset(mockDatasetProxy.Object)); + + var catalog = new Catalog(mockCatalogProxy.Object); + var columns = catalog.ListColumns("dbname"); + Assert.AreSame(mockDataFrameProxy.Object, columns.DataFrameProxy); + } + + [Test] + public void TestGetFunctionsList() + { + var mockCatalogProxy = new Mock(); + var mockDatasetProxy = new Mock(); + var mockDataFrameProxy = new Mock(); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + mockCatalogProxy.Setup(m => m.ListFunctions(It.IsAny())).Returns(new Dataset(mockDatasetProxy.Object)); + + var catalog = new Catalog(mockCatalogProxy.Object); + var columns = catalog.ListFunctions("dbname"); + Assert.AreSame(mockDataFrameProxy.Object, columns.DataFrameProxy); + } + + [Test] + public void TestSetCurrentDatabase() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + catalog.SetCurrentDatabase("dbname"); + mockCatalogProxy.Verify(m => m.SetCurrentDatabase("dbname"), Times.Once); + } + + [Test] + public void TestDropTempTable() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + catalog.DropTempView("tablename"); + mockCatalogProxy.Verify(m => m.DropTempTable("tablename"), Times.Once); + } + + [Test] + public void TestIsCached() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + mockCatalogProxy.Setup(m => m.IsCached(It.IsAny())).Returns(false); + var isCached = catalog.IsCached("tablename"); + mockCatalogProxy.Verify(m => m.IsCached(It.IsAny()), Times.Once); + Assert.False(isCached); + } + + [Test] + public void TestCacheTable() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + catalog.CacheTable("tablename"); + mockCatalogProxy.Verify(m => m.CacheTable("tablename"), Times.Once); + } + + [Test] + public void TestUnCacheTable() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + catalog.UnCacheTable("tablename"); + mockCatalogProxy.Verify(m => m.UnCacheTable("tablename"), Times.Once); + } + + [Test] + public void TestRefreshTable() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + catalog.RefreshTable("tablename"); + mockCatalogProxy.Verify(m => m.RefreshTable("tablename"), Times.Once); + } + + [Test] + public void TestClearCache() + { + var mockCatalogProxy = new Mock(); + var catalog = new Catalog(mockCatalogProxy.Object); + catalog.ClearCache(); + mockCatalogProxy.Verify(m => m.ClearCache(), Times.Once); + } + + [Test] + public void TestCreateExternalTable() + { + var mockCatalogProxy = new Mock(); + DataFrame dataFrame = null; + mockCatalogProxy.Setup(m => m.CreateExternalTable(It.IsAny(), It.IsAny())).Returns(dataFrame); + var catalog = new Catalog(mockCatalogProxy.Object); + var df = catalog.CreateExternalTable("tableName", "path"); + mockCatalogProxy.Verify(m => m.CreateExternalTable("tableName", "path"), Times.Once); + } + + [Test] + public void TestCreateExternalTable2() + { + var mockCatalogProxy = new Mock(); + DataFrame dataFrame = null; + mockCatalogProxy.Setup(m => m.CreateExternalTable(It.IsAny(), 
It.IsAny())).Returns(dataFrame); + var catalog = new Catalog(mockCatalogProxy.Object); + var df = catalog.CreateExternalTable("tableName", "path", "source"); + mockCatalogProxy.Verify(m => m.CreateExternalTable("tableName", "path", "source"), Times.Once); + } + + [Test] + public void TestDatabaseProperties() + { + var database = new Database {Description = "desc", Name = "name", LocationUri = "uri"}; + Assert.AreEqual("desc", database.Description); + Assert.AreEqual("name", database.Name); + Assert.AreEqual("uri", database.LocationUri); + } + + [Test] + public void TestTableProperties() + { + var table = new Table { Description = "desc", Name = "name", Database = "db", TableType = "type", IsTemporary = false}; + Assert.AreEqual("desc", table.Description); + Assert.AreEqual("name", table.Name); + Assert.AreEqual("db", table.Database); + Assert.AreEqual("type", table.TableType); + Assert.False(table.IsTemporary); + } + + [Test] + public void TestColumnProperties() + { + var column = new Column { Description = "desc", Name = "name", DataType = "dtype", IsNullable = true, IsPartition = false, IsBucket = true}; + Assert.AreEqual("desc", column.Description); + Assert.AreEqual("name", column.Name); + Assert.AreEqual("dtype", column.DataType); + Assert.False(column.IsPartition); + Assert.True(column.IsNullable); + Assert.True(column.IsBucket); + } + + [Test] + public void TestFunctionProperties() + { + var function = new Function { Description = "desc", Name = "name", Database = "db", ClassName = "classname", IsTemporary = false }; + Assert.AreEqual("desc", function.Description); + Assert.AreEqual("name", function.Name); + Assert.AreEqual("db", function.Database); + Assert.AreEqual("classname", function.ClassName); + Assert.False(function.IsTemporary); + } + } +} diff --git a/csharp/AdapterTest/DatasetTest.cs b/csharp/AdapterTest/DatasetTest.cs new file mode 100644 index 00000000..7ee59db9 --- /dev/null +++ b/csharp/AdapterTest/DatasetTest.cs @@ -0,0 +1,150 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using AdapterTest.Mocks; +using Microsoft.Spark.CSharp.Interop; +using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Sql; +using Moq; +using NUnit.Framework; + +namespace AdapterTest +{ + [TestFixture] + public class DatasetTest + { + private static Mock mockDatasetProxy; + + [OneTimeSetUp] + public static void ClassInitialize() + { + mockDatasetProxy = new Mock(); + } + + [SetUp] + public void TestInitialize() + { + mockDatasetProxy.Reset(); + } + + [TearDown] + public void TestCleanUp() + { + // Revert to use Static mock class to prevent blocking other test methods which uses static mock class + SparkCLREnvironment.SparkCLRProxy = new MockSparkCLRProxy(); + } + + [Test] + public void TestShow() + { + Mock mockDataFrameProxy = new Mock(); + mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny())).Returns("Show"); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + + var dataset = new Dataset(mockDatasetProxy.Object); + dataset.Show(); + mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once); + } + + [Test] + public void TestExplain() + { + Mock mockDataFrameProxy = new Mock(); + mockDataFrameProxy.Setup(m => m.GetQueryExecution()).Returns("Execution Plan"); + mockDataFrameProxy.Setup(m => m.GetExecutedPlan()).Returns("Execution Plan"); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + + var dataset = new Dataset(mockDatasetProxy.Object); + dataset.Explain(); + 
mockDataFrameProxy.Verify(m => m.GetQueryExecution(), Times.Once); + + dataset.Explain(true); + mockDataFrameProxy.Verify(m => m.GetExecutedPlan(), Times.Once); + } + + [Test] + public void TestSchema() + { + TestSchema(true); + TestSchema(false); + } + + public void TestSchema(bool usePrintSchema) + { + var requestsSchema = new StructType(new List + { + new StructField("test", new StringType(), false), + }); + var jsonValue = requestsSchema.JsonValue.ToString(); + Mock mockStructTypeProxy = new Mock(); + mockStructTypeProxy.Setup(m => m.ToJson()).Returns(jsonValue); + Mock mockDataFrameProxy = new Mock(); + mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockStructTypeProxy.Object); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + + var dataset = new Dataset(mockDatasetProxy.Object); + + if (usePrintSchema) + dataset.PrintSchema(); + else + dataset.ShowSchema(); + + mockDataFrameProxy.Verify(m => m.GetSchema(), Times.Once); + mockStructTypeProxy.Verify(m => m.ToJson(), Times.Once()); + } + + [Test] + public void TestColumns() + { + var requestsSchema = new StructType(new List + { + new StructField("test", new StringType(), false), + }); + var x = requestsSchema.JsonValue.ToString(); + Mock mockStructTypeProxy = new Mock(); + mockStructTypeProxy.Setup(m => m.ToJson()).Returns(x); + Mock mockStructFieldProxy = new Mock(); + mockStructFieldProxy.Setup(m => m.GetStructFieldName()).Returns("testcol"); + mockStructTypeProxy.Setup(m => m.GetStructTypeFields()) + .Returns(new List() {mockStructFieldProxy.Object}); + Mock mockDataFrameProxy = new Mock(); + mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockStructTypeProxy.Object); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + + var dataset = new Dataset(mockDatasetProxy.Object); + var columns = dataset.Columns(); + Assert.AreEqual(1, columns.Count()); + Assert.AreEqual("testcol", columns.First()); + } + + [Test] + public void TestDTypes() + { + var requestsSchema = new StructType(new List + { + new StructField("test", new StringType(), false), + }); + var x = requestsSchema.JsonValue.ToString(); + Mock mockStructTypeProxy = new Mock(); + mockStructTypeProxy.Setup(m => m.ToJson()).Returns(x); + Mock mockStructFieldProxy = new Mock(); + mockStructFieldProxy.Setup(m => m.GetStructFieldName()).Returns("testcol"); + Mock mockStructDataTypeProxy = new Mock(); + mockStructDataTypeProxy.Setup(m => m.GetDataTypeSimpleString()).Returns("ss"); + mockStructFieldProxy.Setup(m => m.GetStructFieldDataType()).Returns(mockStructDataTypeProxy.Object); + mockStructTypeProxy.Setup(m => m.GetStructTypeFields()) + .Returns(new List() { mockStructFieldProxy.Object }); + Mock mockDataFrameProxy = new Mock(); + mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockStructTypeProxy.Object); + mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); + + var dataset = new Dataset(mockDatasetProxy.Object); + var dTypes = dataset.DTypes(); + Assert.AreEqual(1, dTypes.Count()); + var first = dTypes.First(); + Assert.AreEqual("testcol", first.Item1); + Assert.AreEqual("ss", first.Item2); + } + + } +} diff --git a/csharp/AdapterTest/HiveContextTest.cs b/csharp/AdapterTest/HiveContextTest.cs index 8a67b2df..8e55f029 100644 --- a/csharp/AdapterTest/HiveContextTest.cs +++ b/csharp/AdapterTest/HiveContextTest.cs @@ -45,8 +45,21 @@ public void TestCleanUp() [Test] public void TestHiveContextConstructor() { - var hiveContext = new HiveContext(new SparkContext("", "")); - 
Assert.IsNotNull((hiveContext.SqlContextProxy as MockSqlContextProxy).mockSqlContextReference); + var mockSparkContextProxy = new Mock(); + + var mockSparkSessionProxy = new Mock(); + var mockCatalogProxy = new Mock(); + mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny())); + mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object); + mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object); + + var mockSparkConfProxy = new Mock(); + mockSparkConfProxy.Setup(m => m.GetSparkConfAsString()) + .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;"); + + var conf = new SparkConf(mockSparkConfProxy.Object); + var hiveContext = new HiveContext(new SparkContext(mockSparkContextProxy.Object, conf)); + Assert.IsNotNull(hiveContext.SparkSession); } [Test] @@ -54,14 +67,25 @@ public void TestHiveContextRefreshTable() { // arrange var mockSparkContextProxy = new Mock(); - mockSqlContextProxy.Setup(m => m.RefreshTable(It.IsAny())); - var hiveContext = new HiveContext(new SparkContext("", ""), mockSqlContextProxy.Object); + var mockSparkSessionProxy = new Mock(); + var mockCatalogProxy = new Mock(); + mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny())); + mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object); + mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object); + + var mockSparkConfProxy = new Mock(); + mockSparkConfProxy.Setup(m => m.GetSparkConfAsString()) + .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;"); + + var conf = new SparkConf(mockSparkConfProxy.Object); + var hiveContext = new HiveContext(new SparkContext(mockSparkContextProxy.Object, conf)); + hiveContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object; // act hiveContext.RefreshTable("table"); // assert - mockSqlContextProxy.Verify(m => m.RefreshTable("table")); + mockCatalogProxy.Verify(m => m.RefreshTable("table")); } } } diff --git a/csharp/AdapterTest/Mocks/MockSparkConfProxy.cs b/csharp/AdapterTest/Mocks/MockSparkConfProxy.cs index 2f8bd99b..3ce3bb5b 100644 --- a/csharp/AdapterTest/Mocks/MockSparkConfProxy.cs +++ b/csharp/AdapterTest/Mocks/MockSparkConfProxy.cs @@ -60,5 +60,10 @@ public string Get(string key, string defaultValue) } return defaultValue; } + + public string GetSparkConfAsString() + { + throw new NotImplementedException(); + } } } diff --git a/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs b/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs index b0f1fced..6a6b1d8b 100644 --- a/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs +++ b/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs @@ -293,19 +293,14 @@ public ISparkConfProxy GetConf() return new MockSparkConfProxy(); } - public ISqlContextProxy CreateSqlContext() - { - return new MockSqlContextProxy(this); - } - - public ISqlContextProxy CreateHiveContext() + public IRDDProxy Parallelize(IEnumerable values, int numSlices) { - return new MockSqlContextProxy(this); + return new MockRddProxy(null); } - public IRDDProxy Parallelize(IEnumerable values, int numSlices) + public ISparkSessionProxy CreateSparkSession() { - return new MockRddProxy(null); + return new MockSparkSessionProxy(); } } } diff --git a/csharp/AdapterTest/Mocks/MockSparkSessionProxy.cs b/csharp/AdapterTest/Mocks/MockSparkSessionProxy.cs new file mode 100644 index 00000000..da695c3f --- /dev/null +++ b/csharp/AdapterTest/Mocks/MockSparkSessionProxy.cs @@ -0,0 +1,53 @@ +// Copyright 
(c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Proxy; + +namespace AdapterTest.Mocks +{ + class MockSparkSessionProxy : ISparkSessionProxy + { + public ISqlContextProxy SqlContextProxy { get { return new MockSqlContextProxy(new MockSparkContextProxy(new MockSparkConfProxy()));} } + public IUdfRegistration Udf { get; } + public ICatalogProxy GetCatalog() + { + throw new NotImplementedException(); + } + + public IDataFrameReaderProxy Read() + { + return new MockDataFrameReaderProxy(SqlContextProxy); + } + + internal ISparkSessionProxy InjectedSparkSessionProxy { get; set; } + public ISparkSessionProxy NewSession() + { + return InjectedSparkSessionProxy; + } + + public IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy) + { + throw new NotImplementedException(); + } + + public IDataFrameProxy Table(string tableName) + { + return new MockDataFrameProxy(new object[] { tableName }, null); + } + + public IDataFrameProxy Sql(string query) + { + return new MockDataFrameProxy(new object[] {query}, null); + } + + public void Stop() + { + throw new NotImplementedException(); + } + } +} diff --git a/csharp/AdapterTest/Mocks/MockSqlContextProxy.cs b/csharp/AdapterTest/Mocks/MockSqlContextProxy.cs index e89996de..4dd02787 100644 --- a/csharp/AdapterTest/Mocks/MockSqlContextProxy.cs +++ b/csharp/AdapterTest/Mocks/MockSqlContextProxy.cs @@ -69,11 +69,6 @@ public void RegisterFunction(string name, byte[] command, string returnType) throw new NotImplementedException(); } - public ISqlContextProxy NewSession() - { - throw new NotImplementedException(); - } - public string GetConf(string key, string defaultValue) { throw new NotImplementedException(); diff --git a/csharp/AdapterTest/SparkSessionTest.cs b/csharp/AdapterTest/SparkSessionTest.cs new file mode 100644 index 00000000..3dbd0af3 --- /dev/null +++ b/csharp/AdapterTest/SparkSessionTest.cs @@ -0,0 +1,30 @@ +using System; +using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Sql; +using Moq; +using NUnit.Framework; + +namespace AdapterTest +{ + [TestFixture] + public class SparkSessionTest + { + [Test] + public void TestRead() + { + var mockSparkSessionProxy = new Mock(); + var sparkSession = new SparkSession(mockSparkSessionProxy.Object); + var reader = sparkSession.Read(); + mockSparkSessionProxy.Verify(m => m.Read(), Times.Once); + } + + [Test] + public void TestStop() + { + var mockSparkSessionProxy = new Mock(); + var sparkSession = new SparkSession(mockSparkSessionProxy.Object); + sparkSession.Stop(); + mockSparkSessionProxy.Verify(m => m.Stop(), Times.Once); + } + } +} diff --git a/csharp/AdapterTest/SqlContextTest.cs b/csharp/AdapterTest/SqlContextTest.cs index faaeb8aa..a403b704 100644 --- a/csharp/AdapterTest/SqlContextTest.cs +++ b/csharp/AdapterTest/SqlContextTest.cs @@ -61,15 +61,16 @@ public void TestSqlContextGetOrCreate() public void TestSqlContextNewSession() { // arrange - var sessionProxy = new SqlContextIpcProxy(new JvmObjectReference("1")); - mockSqlContextProxy.Setup(m => m.NewSession()).Returns(sessionProxy); - var sqlContext = new SqlContext(new SparkContext("", ""), mockSqlContextProxy.Object); + var sparkSessionProxy = new Mock(); + var newSparkSessionProxy = new Mock(); // act - var actualNewSession = sqlContext.NewSession(); + 
sparkSessionProxy.Setup(m => m.NewSession()).Returns(newSparkSessionProxy.Object); + var sqlContext = new SqlContext(new SparkSession(sparkSessionProxy.Object)); + var ns = sqlContext.NewSession(); // assert - Assert.AreEqual(sessionProxy, actualNewSession.SqlContextProxy); + sparkSessionProxy.Verify(m => m.NewSession()); } [Test] @@ -79,9 +80,24 @@ public void TestSqlContextGetConf() const string key = "key"; const string value = "value"; mockSqlContextProxy.Setup(m => m.GetConf(key, "")).Returns(value); - var sqlContext = new SqlContext(new SparkContext("", ""), mockSqlContextProxy.Object); + var mockSparkContextProxy = new Mock(); - // act + var mockSparkSessionProxy = new Mock(); + var mockCatalogProxy = new Mock(); + mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny())); + mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object); + mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object); + mockSparkSessionProxy.Setup(m => m.SqlContextProxy).Returns(mockSqlContextProxy.Object); + + var mockSparkConfProxy = new Mock(); + mockSparkConfProxy.Setup(m => m.GetSparkConfAsString()) + .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;"); + + var conf = new SparkConf(mockSparkConfProxy.Object); + var sqlContext = new SqlContext(new SparkContext(mockSparkContextProxy.Object, conf)); + sqlContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object; + + //act var actualValue = sqlContext.GetConf(key, ""); // assert @@ -95,7 +111,22 @@ public void TestSqlContextSetConf() const string key = "key"; const string value = "value"; mockSqlContextProxy.Setup(m => m.SetConf(key, value)); - var sqlContext = new SqlContext(new SparkContext("", ""), mockSqlContextProxy.Object); + var mockSparkContextProxy = new Mock(); + + var mockSparkSessionProxy = new Mock(); + var mockCatalogProxy = new Mock(); + mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny())); + mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object); + mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object); + mockSparkSessionProxy.Setup(m => m.SqlContextProxy).Returns(mockSqlContextProxy.Object); + + var mockSparkConfProxy = new Mock(); + mockSparkConfProxy.Setup(m => m.GetSparkConfAsString()) + .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;"); + + var conf = new SparkConf(mockSparkConfProxy.Object); + var sqlContext = new SqlContext(new SparkContext(mockSparkContextProxy.Object, conf)); + sqlContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object; // act sqlContext.SetConf(key, value); @@ -175,16 +206,11 @@ public void TestSqlContextDropTempTable() [Test] public void TestSqlContextTable() { - // arrange - var sqlContext = new SqlContext(new SparkContext("", ""), mockSqlContextProxy.Object); - var dataFrameProxy = new DataFrameIpcProxy(new JvmObjectReference("1"), mockSqlContextProxy.Object); - mockSqlContextProxy.Setup(m => m.Table(It.IsAny())).Returns(dataFrameProxy); - - // act - var actualTableDataFrame = sqlContext.Table("table"); - - // assert - Assert.AreEqual(dataFrameProxy, actualTableDataFrame.DataFrameProxy); + var sqlContext = new SqlContext(new SparkContext("", "")); + string tableName = "TestTableName"; + var dataFrame = sqlContext.Table(tableName); + var paramValuesToTableMethod = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference; + Assert.AreEqual(tableName, 
paramValuesToTableMethod[0]); } [Test] @@ -292,8 +318,8 @@ public void TestSqlContextSql() { var sqlContext = new SqlContext(new SparkContext("", "")); var dataFrame = sqlContext.Sql("Query of SQL text"); - var paramValuesToJsonFileMethod = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference; - Assert.AreEqual("Query of SQL text", paramValuesToJsonFileMethod[0]); + var paramValuesToSqlMethod = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference; + Assert.AreEqual("Query of SQL text", paramValuesToSqlMethod[0]); } [Test] diff --git a/csharp/Samples/Microsoft.Spark.CSharp/App.config b/csharp/Samples/Microsoft.Spark.CSharp/App.config index 7ac02fd5..64779e2f 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/App.config +++ b/csharp/Samples/Microsoft.Spark.CSharp/App.config @@ -58,8 +58,9 @@ + diff --git a/csharp/Samples/Microsoft.Spark.CSharp/CatalogSamples.cs b/csharp/Samples/Microsoft.Spark.CSharp/CatalogSamples.cs new file mode 100644 index 00000000..cdca7d46 --- /dev/null +++ b/csharp/Samples/Microsoft.Spark.CSharp/CatalogSamples.cs @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace Microsoft.Spark.CSharp.Samples +{ + class CatalogSamples + { + [Sample] + internal static void CatalogSample() + { + var catalog = SparkSessionSamples.GetSparkSession().Catalog; + var currentDatabase = catalog.CurrentDatabase; + var databasesList = SparkSessionSamples.GetSparkSession().Catalog.ListDatabases().Collect(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + var defaultDatabase = databasesList.First(row => row.Get("name").Equals("default")); //throws exception if First() is missing + } + } + } +} diff --git a/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs b/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs index f0a691e9..5f4e5b49 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs @@ -16,11 +16,11 @@ namespace Microsoft.Spark.CSharp.Samples { class DataFrameSamples { - private const string PeopleJson = @"people.json"; - private const string OrderJson = @"order.json"; - private const string RequestsLog = @"requestslog.txt"; - private const string MetricsLog = @"metricslog.txt"; - private const string CSVTestLog = @"csvtestlog.txt"; + internal const string PeopleJson = @"people.json"; + internal const string OrderJson = @"order.json"; + internal const string RequestsLog = @"requestslog.txt"; + internal const string MetricsLog = @"metricslog.txt"; + internal const string CSVTestLog = @"csvtestlog.txt"; private static SqlContext sqlContext; diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Program.cs b/csharp/Samples/Microsoft.Spark.CSharp/Program.cs index ec5dce6d..1f25fa26 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Program.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/Program.cs @@ -78,6 +78,7 @@ private static SparkContext CreateSparkContext() { conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride); } + return new SparkContext(conf); } diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj b/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj index 5dfb94da..880feb27 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj 
+++ b/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj @@ -47,6 +47,7 @@ + @@ -61,6 +62,7 @@ + diff --git a/csharp/Samples/Microsoft.Spark.CSharp/SparkSessionSamples.cs b/csharp/Samples/Microsoft.Spark.CSharp/SparkSessionSamples.cs new file mode 100644 index 00000000..f628e1c8 --- /dev/null +++ b/csharp/Samples/Microsoft.Spark.CSharp/SparkSessionSamples.cs @@ -0,0 +1,189 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Sql; +using NUnit.Framework; + +namespace Microsoft.Spark.CSharp.Samples +{ + class SparkSessionSamples + { + private static SparkSession sparkSession; + + internal static SparkSession GetSparkSession() + { + return sparkSession ?? (sparkSession = SparkSession.Builder().EnableHiveSupport().GetOrCreate()); + } + + [Sample] + internal static void SSNewSessionSample() + { + RunDataFrameSample(true); + } + + [Sample] + internal static void SSDataFrameSample() + { + RunDataFrameSample(false); + } + + private static void RunDataFrameSample(bool createNewSession) + { + SparkSession ss = GetSparkSession(); + + if (createNewSession) + { + ss = sparkSession.NewSession(); + } + + var peopleDataFrame = ss.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson)); + var count = peopleDataFrame.Count(); + Console.WriteLine("Count of items in DataFrame {0}", count); + + var sortedDataFrame = peopleDataFrame.Sort(new string[] { "name", "age" }, new bool[] { true, false }); + + sortedDataFrame.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + var sortedDF = sortedDataFrame.Collect().ToArray(); + Assert.AreEqual("789", sortedDF[0].GetAs("id")); + Assert.AreEqual("123", sortedDF[1].GetAs("id")); + Assert.AreEqual("531", sortedDF[2].GetAs("id")); + Assert.AreEqual("456", sortedDF[3].GetAs("id")); + } + } + + [Sample] + internal static void SSShowSchemaSample() + { + var peopleDataFrame = GetSparkSession().Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson)); + peopleDataFrame.Explain(true); + peopleDataFrame.ShowSchema(); + } + + [Sample] + internal static void SSTableSample() + { + var originalPeopleDataFrame = GetSparkSession().Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson)); + originalPeopleDataFrame.RegisterTempTable("people"); + + var peopleDataFrame = GetSparkSession().Table("people"); + + var projectedFilteredDataFrame = peopleDataFrame.Select("name", "address.state") + .Where("name = 'Bill' or state = 'California'"); + + projectedFilteredDataFrame.ShowSchema(); + projectedFilteredDataFrame.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + CollectionAssert.AreEqual(new[] { "name", "state" }, projectedFilteredDataFrame.Schema.Fields.Select(f => f.Name).ToArray()); + Assert.IsTrue(projectedFilteredDataFrame.Collect().All(row => row.Get("name") == "Bill" || row.Get("state") == "California")); + } + } + + [Sample] + internal static void SSSqlSample() + { + var originalPeopleDataFrame = GetSparkSession().Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson)); + originalPeopleDataFrame.RegisterTempTable("people"); + + var nameFilteredDataFrame = GetSparkSession().Sql("SELECT name, address.city, address.state FROM 
people where name='Bill'"); + var countDataFrame = GetSparkSession().Sql("SELECT count(name) FROM people where name='Bill'"); + var maxAgeDataFrame = GetSparkSession().Sql("SELECT max(age) FROM people where name='Bill'"); + long maxAgeDataFrameRowsCount = maxAgeDataFrame.Count(); + long nameFilteredDataFrameRowsCount = nameFilteredDataFrame.Count(); + long countDataFrameRowsCount = countDataFrame.Count(); + Console.WriteLine("nameFilteredDataFrameRowsCount={0}, maxAgeDataFrameRowsCount={1}, countDataFrameRowsCount={2}", nameFilteredDataFrameRowsCount, maxAgeDataFrameRowsCount, countDataFrameRowsCount); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(1, maxAgeDataFrameRowsCount); + Assert.AreEqual(2, nameFilteredDataFrameRowsCount); + Assert.AreEqual(1, countDataFrameRowsCount); + } + } + + [Sample] + internal static void SSDropTableSample() + { + var originalPeopleDataFrame = GetSparkSession().Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson)); + originalPeopleDataFrame.RegisterTempTable("people"); + + var nameFilteredDataFrame = GetSparkSession().Sql("SELECT name, address.city, address.state FROM people where name='Bill'"); + long nameFilteredDataFrameRowsCount = nameFilteredDataFrame.Count(); + + GetSparkSession().Catalog.DropTempView("people"); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + bool tableMissing = false; + try + { + //parsing would fail + var nameFilteredDataFrame2 = GetSparkSession().Sql("SELECT name, address.city, address.state FROM people where name='Bill'"); + } + catch (Exception) + { + tableMissing = true; + } + + Assert.True(tableMissing); + } + } + + [Sample] + internal static void SSCreateDataFrameSample() + { + var schemaPeople = new StructType(new List + { + new StructField("id", new StringType()), + new StructField("name", new StringType()), + new StructField("age", new IntegerType()), + new StructField("address", new StructType(new List + { + new StructField("city", new StringType()), + new StructField("state", new StringType()) + })), + new StructField("phone numbers", new ArrayType(new StringType())) + }); + + var rddPeople = SparkCLRSamples.SparkContext.Parallelize( + new List + { + new object[] { "123", "Bill", 43, new object[]{ "Columbus", "Ohio" }, new string[]{ "Tel1", "Tel2" } }, + new object[] { "456", "Steve", 34, new object[]{ "Seattle", "Washington" }, new string[]{ "Tel3", "Tel4" } } + }); + + var dataFramePeople = GetSparkSession().CreateDataFrame(rddPeople, schemaPeople); + Console.WriteLine("------ Schema of People Data Frame:\r\n"); + dataFramePeople.ShowSchema(); + Console.WriteLine(); + var collected = dataFramePeople.Collect().ToArray(); + foreach (var people in collected) + { + string id = people.Get("id"); + string name = people.Get("name"); + int age = people.Get("age"); + Row address = people.Get("address"); + string city = address.Get("city"); + string state = address.Get("state"); + object[] phoneNumbers = people.Get("phone numbers"); + Console.WriteLine("id:{0}, name:{1}, age:{2}, address:(city:{3},state:{4}), phoneNumbers:[{5},{6}]\r\n", id, name, age, city, state, phoneNumbers[0], phoneNumbers[1]); + } + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(2, dataFramePeople.Rdd.Count()); + Assert.AreEqual(schemaPeople.Json, dataFramePeople.Schema.Json); + } + } + } +} diff --git a/scala/src/main/org/apache/spark/sql/api/csharp/JvmBridgeUtils.scala b/scala/src/main/org/apache/spark/sql/api/csharp/JvmBridgeUtils.scala 
index 74f3514f..128bff23 100644 --- a/scala/src/main/org/apache/spark/sql/api/csharp/JvmBridgeUtils.scala +++ b/scala/src/main/org/apache/spark/sql/api/csharp/JvmBridgeUtils.scala @@ -6,6 +6,7 @@ package org.apache.spark.sql.api.csharp import java.util +import org.apache.spark.SparkConf import scala.collection.JavaConverters._ /* @@ -18,4 +19,23 @@ object JvmBridgeUtils { def toMutableMap[K, V](map: util.HashMap[K, V]) : Map[K, V] = { map.asScala.toMap } + + def getKeyValuePairAsString(kvp: Tuple2[String, String]) : String = { + return kvp._1 + "=" + kvp._2 + } + + def getKeyValuePairArrayAsString(kvpArray : Array[Tuple2[String, String]]) : String = { + val sb = new StringBuilder + + for(kvp <- kvpArray) { + sb.append(getKeyValuePairAsString(kvp)) + sb.append(";") + } + + sb.toString + } + + def getSparkConfAsString(sparkConf: SparkConf): String = { + getKeyValuePairArrayAsString(sparkConf.getAll) + } } diff --git a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala index e1aad772..f13d0087 100644 --- a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala +++ b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala @@ -25,13 +25,12 @@ import org.apache.spark.broadcast.Broadcast * implementation constructs from SparkR */ object SQLUtils { - def createSQLContext(sc: SparkContext): SQLContext = { - new SQLContext(sc) + def createSparkSession(sc: SparkContext): SparkSession = { + new SparkSession(sc) } - def createHiveContext(sc: SparkContext): SQLContext = { - // TODO fix this - new SQLContext(sc) + def getSqlContext(ss: SparkSession): SQLContext = { + ss.sqlContext } def getJavaSparkContext(sqlCtx: SQLContext): JavaSparkContext = { diff --git a/scala/src/test/scala/org/apache/spark/util/csharp/JvmBridgeUtilsSuite.scala b/scala/src/test/scala/org/apache/spark/util/csharp/JvmBridgeUtilsSuite.scala new file mode 100644 index 00000000..faef7e69 --- /dev/null +++ b/scala/src/test/scala/org/apache/spark/util/csharp/JvmBridgeUtilsSuite.scala @@ -0,0 +1,31 @@ +/* + * Copyright (c) Microsoft. All rights reserved. + * Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+ */ + +package org.apache.spark.util.csharp + +import org.apache.spark.SparkConf +import org.apache.spark.csharp.SparkCLRFunSuite +import org.apache.spark.sql.api.csharp.JvmBridgeUtils + + +class JvmBridgeUtilsSuite extends SparkCLRFunSuite{ + test("getSparkConfAsString") { + var sparkConf = new SparkConf(true) + val appName = "appName" + sparkConf.setAppName(appName) + val master = "master" + sparkConf.setMaster(master) + val kvp1 = ("spark.config1.name", "config1.value") + sparkConf.set(kvp1._1, kvp1._2) + val kvp2 = ("spark.config2.name", "config2.value") + sparkConf.set(kvp2._1, kvp2._2) + + val returnValue = JvmBridgeUtils.getSparkConfAsString(sparkConf) + assert(returnValue.contains(s"spark.master=${master}")) + assert(returnValue.contains(s"spark.app.name=${appName}")) + assert(returnValue.contains(s"${kvp1._1}=${kvp1._2}")) + assert(returnValue.contains(s"${kvp2._1}=${kvp2._2}")) + } +} \ No newline at end of file From 9a37ec84fde9882f5ac1c7792025c73877e5849e Mon Sep 17 00:00:00 2001 From: Yun Tang Date: Sat, 10 Sep 2016 05:20:34 +0800 Subject: [PATCH 05/15] Use logInfo and logError method instead of println (#556) * Use logInfo and logError method insted of println Finish TODO "logError does not work now" * remove redundant printStackTrace method --- .../spark/api/csharp/CSharpBackend.scala | 16 ++++--- .../api/csharp/CSharpBackendHandler.scala | 46 ++++++------------- .../apache/spark/api/csharp/CSharpRDD.scala | 2 +- .../spark/deploy/csharp/CSharpRunner.scala | 28 ++++++----- .../streaming/api/csharp/CSharpDStream.scala | 8 ++-- 5 files changed, 40 insertions(+), 60 deletions(-) diff --git a/scala/src/main/org/apache/spark/api/csharp/CSharpBackend.scala b/scala/src/main/org/apache/spark/api/csharp/CSharpBackend.scala index 2cdbc8d2..9fae5f7f 100644 --- a/scala/src/main/org/apache/spark/api/csharp/CSharpBackend.scala +++ b/scala/src/main/org/apache/spark/api/csharp/CSharpBackend.scala @@ -7,15 +7,16 @@ package org.apache.spark.api.csharp import java.io.{DataOutputStream, File, FileOutputStream, IOException} import java.net.{InetAddress, InetSocketAddress, ServerSocket, Socket} -import java.util.concurrent.{LinkedBlockingQueue, BlockingQueue, TimeUnit} +import java.util.concurrent.{BlockingQueue, LinkedBlockingQueue, TimeUnit} import io.netty.bootstrap.ServerBootstrap import io.netty.channel.nio.NioEventLoopGroup import io.netty.channel.socket.SocketChannel import io.netty.channel.socket.nio.NioServerSocketChannel -import io.netty.channel.{ChannelInitializer, EventLoopGroup, ChannelFuture} +import io.netty.channel.{ChannelFuture, ChannelInitializer, EventLoopGroup} import io.netty.handler.codec.LengthFieldBasedFrameDecoder import io.netty.handler.codec.bytes.{ByteArrayDecoder, ByteArrayEncoder} +import org.apache.spark.internal.Logging /** @@ -24,9 +25,10 @@ import io.netty.handler.codec.bytes.{ByteArrayDecoder, ByteArrayEncoder} * This implementation is identical to RBackend and that can be reused * in SparkCLR if the handler is made pluggable */ -// Since SparkCLR is a package to Spark and not a part of spark-core it mirrors the implementation of -// selected parts from RBackend with SparkCLR customizations -class CSharpBackend { self => // for accessing the this reference in inner class(ChannelInitializer) +// Since SparkCLR is a package to Spark and not a part of spark-core it mirrors the implementation +// of selected parts from RBackend with SparkCLR customizations +class CSharpBackend extends Logging +{ self => // for accessing the this reference in inner 
class(ChannelInitializer) private[this] var channelFuture: ChannelFuture = null private[this] var bootstrap: ServerBootstrap = null private[this] var bossGroup: EventLoopGroup = null @@ -82,7 +84,7 @@ class CSharpBackend { self => // for accessing the this reference in inner class bootstrap = null // Send close to CSharp callback server. - println("Requesting to close all call back sockets.") + logInfo("Requesting to close all call back sockets.") var socket: Socket = null do { socket = CSharpBackend.callbackSockets.poll() @@ -94,7 +96,7 @@ class CSharpBackend { self => // for accessing the this reference in inner class socket = null } catch { - case e : Exception => println("Exception when closing socket: " + e) + case e : Exception => logError("Exception when closing socket: ", e) } } } while (socket != null) diff --git a/scala/src/main/org/apache/spark/api/csharp/CSharpBackendHandler.scala b/scala/src/main/org/apache/spark/api/csharp/CSharpBackendHandler.scala index 9fd9fd92..20fd3816 100644 --- a/scala/src/main/org/apache/spark/api/csharp/CSharpBackendHandler.scala +++ b/scala/src/main/org/apache/spark/api/csharp/CSharpBackendHandler.scala @@ -6,11 +6,12 @@ package org.apache.spark.api.csharp import org.apache.spark.util.Utils -import java.io.{DataOutputStream, ByteArrayOutputStream, DataInputStream, ByteArrayInputStream} +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import java.net.Socket import io.netty.channel.ChannelHandler.Sharable import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} +import org.apache.spark.internal.Logging // TODO - work with SparkR devs to make this configurable and reuse RBackendHandler import org.apache.spark.api.csharp.SerDe._ @@ -24,7 +25,8 @@ import scala.collection.mutable.HashMap */ // Since SparkCLR is a package to Spark and not a part of spark-core, it mirrors the implementation // of selected parts from RBackend with SparkCLR customizations -class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHandler[Array[Byte]] { +class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHandler[Array[Byte]] + with Logging{ override def channelRead0(ctx: ChannelHandlerContext, msg: Array[Byte]): Unit = { val reply = handleBackendRequest(msg) @@ -71,15 +73,13 @@ class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHa val t = readObjectType(dis) assert(t == 'i') val port = readInt(dis) - // scalastyle:off println - println("[CSharpBackendHandler] Connecting to a callback server at port " + port) + logInfo(s"Connecting to a callback server at port $port") CSharpBackend.callbackPort = port writeInt(dos, 0) writeType(dos, "void") case "closeCallback" => // Send close to CSharp callback server. 
- println("[CSharpBackendHandler] Requesting to close all call back sockets.") - // scalastyle:on + logInfo("Requesting to close all call back sockets.") var socket: Socket = null do { socket = CSharpBackend.callbackSockets.poll() @@ -91,7 +91,7 @@ class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHa socket = null } catch { - case e: Exception => println("Exception when closing socket: " + e) + case e: Exception => logError("Exception when closing socket: ", e) } } } while (socket != null) @@ -111,10 +111,7 @@ class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHa override def exceptionCaught(ctx: ChannelHandlerContext, cause: Throwable): Unit = { // Close the connection when an exception is raised. - // scalastyle:off println - println("Exception caught: " + cause.getMessage) - // scalastyle:on - cause.printStackTrace() + logError("Exception caught: ", cause) ctx.close() } @@ -176,31 +173,26 @@ class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHa } } catch { case e: Exception => - // TODO - logError does not work now..fix //logError(s"$methodName on $objId failed", e) val jvmObj = JVMObjectTracker.get(objId) val jvmObjName = jvmObj match { case Some(jObj) => jObj.getClass.getName case None => "NullObject" } - // scalastyle:off println - println(s"[CSharpBackendHandler] $methodName on object of type $jvmObjName failed") - println(e.getMessage) - println(e.printStackTrace()) + logError(s"On object of type $jvmObjName failed", e) if (methods != null) { - println("methods:") - methods.foreach(println(_)) + logError("methods:") + methods.foreach(m => logError(m.toString)) } if (args != null) { - println("args:") + logError("args:") args.foreach(arg => { if (arg != null) { - println("argType: " + arg.getClass.getCanonicalName + ", argValue: " + arg) + logError(s"argType: ${arg.getClass.getCanonicalName}, argValue: $arg") } else { - println("arg: NULL") + logError("arg: NULL") } }) } - // scalastyle:on println writeInt(dos, -1) writeString(dos, Utils.exceptionString(e.getCause)) } @@ -254,16 +246,6 @@ class CSharpBackendHandler(server: CSharpBackend) extends SimpleChannelInboundHa true } - // scalastyle:off println - def logError(id: String) { - println(id) - } - - def logWarning(id: String) { - println(id) - } - - // scalastyle:on println def logError(id: String, e: Exception): Unit = { diff --git a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala index 3ecd8969..1faf7766 100644 --- a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala +++ b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala @@ -181,7 +181,7 @@ class CSharpRDD( case e: OverlappingFileLockException => logInfo("Already obtained the lock.") waitUnzipOperationDone(doneFlag) - case e: Exception => e.printStackTrace() + case e: Exception => logError("Exception when unzipping cSharpWorkerWorkingDir", e) } finally { if (lock != null && lock.isValid) lock.release() diff --git a/scala/src/main/org/apache/spark/deploy/csharp/CSharpRunner.scala b/scala/src/main/org/apache/spark/deploy/csharp/CSharpRunner.scala index ddd72351..63986760 100644 --- a/scala/src/main/org/apache/spark/deploy/csharp/CSharpRunner.scala +++ b/scala/src/main/org/apache/spark/deploy/csharp/CSharpRunner.scala @@ -14,6 +14,7 @@ import org.apache.spark.SparkConf import org.apache.spark.SecurityManager import org.apache.spark.api.csharp.CSharpBackend import org.apache.spark.deploy.{PythonRunner, 
SparkHadoopUtil, SparkSubmitArguments} +import org.apache.spark.internal.Logging import org.apache.spark.util.{RedirectThread, Utils} import org.apache.spark.util.csharp.{Utils => CSharpSparkUtils} @@ -22,8 +23,7 @@ import org.apache.spark.util.csharp.{Utils => CSharpSparkUtils} * gets its port number and launches C# process passing the port number to it. * The runner implementation is mostly identical to RRunner with SparkCLR-specific customizations. */ -// scalastyle:off println -object CSharpRunner { +object CSharpRunner extends Logging{ val MOBIUS_DEBUG_PORT = 5567 def main(args: Array[String]): Unit = { @@ -51,7 +51,7 @@ object CSharpRunner { zipFileName = downloadDriverFile(zipFileName, driverDir.getAbsolutePath).getName } - println(s"[CSharpRunner.main] Unzipping driver $zipFileName in $driverDir") + logInfo(s"Unzipping driver $zipFileName in $driverDir") CSharpSparkUtils.unzip(new File(zipFileName), driverDir) // reusing windows-specific formatting in PythonRunner csharpExecutable = PythonRunner.formatPath(args(1)) @@ -74,7 +74,7 @@ object CSharpRunner { processParameters.add(formatPath(csharpExecutable)) otherArgs.foreach( arg => processParameters.add(arg) ) - println("[CSharpRunner.main] Starting CSharpBackend!") + logInfo("Starting CSharpBackend!") // Time to wait for CSharpBackend to initialize in seconds val backendTimeout = sys.env.getOrElse("CSHARPBACKEND_TIMEOUT", "120").toInt @@ -88,8 +88,7 @@ object CSharpRunner { // need to get back csharpBackendPortNumber because if the value passed to init is 0 // the port number is dynamically assigned in the backend csharpBackendPortNumber = csharpBackend.init(csharpBackendPortNumber) - println("[CSharpRunner.main] Port number used by CSharpBackend is " - + csharpBackendPortNumber) // TODO - send to logger also + logInfo(s"Port number used by CSharpBackend is $csharpBackendPortNumber") initialized.release() csharpBackend.run() } @@ -107,8 +106,7 @@ object CSharpRunner { for ((key, value) <- Utils.getSystemProperties if key.startsWith("spark.")) { env.put(key, value) - println("[CSharpRunner.main] adding key=" + key - + " and value=" + value + " to environment") + logInfo(s"Adding key=$key and value=$value to environment") } builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize val process = builder.start() @@ -123,22 +121,23 @@ object CSharpRunner { closeBackend(csharpBackend) } catch { case t: Throwable => - println("[CSharpRunner.main]" + t.getMessage + "\n" + t.getStackTrace) + logError(s"${t.getMessage} \n ${t.getStackTrace}") } - println("[CSharpRunner.main] Return CSharpBackend code " + returnCode) + logInfo(s"Return CSharpBackend code $returnCode") CSharpSparkUtils.exit(returnCode) } else { + // scalastyle:off println println("***********************************************************************") println("* [CSharpRunner.main] Backend running debug mode. 
Press enter to exit *") println("***********************************************************************") + // scalastyle:on println Console.readLine() closeBackend(csharpBackend) CSharpSparkUtils.exit(0) } } else { - println("[CSharpRunner.main] CSharpBackend did not initialize in " - + backendTimeout + " seconds") + logError(s"CSharpBackend did not initialize in $backendTimeout seconds") CSharpSparkUtils.exit(-1) } } @@ -168,7 +167,7 @@ object CSharpRunner { val localFile = new File(driverDir, jarFileName) if (!localFile.exists()) { // May already exist if running multiple workers on one node - println(s"Copying user file $filePath to $driverDir") + logInfo(s"Copying user file $filePath to $driverDir") Utils.fetchFile( hdfsFilePath, new File(driverDir), @@ -187,7 +186,7 @@ object CSharpRunner { } def closeBackend(csharpBackend: CSharpBackend): Unit = { - println("[CSharpRunner.main] closing CSharpBackend") + logInfo("Closing CSharpBackend") csharpBackend.close() } @@ -205,4 +204,3 @@ object CSharpRunner { (runInDebugMode, portNumber) } } -// scalastyle:on println diff --git a/scala/src/main/org/apache/spark/streaming/api/csharp/CSharpDStream.scala b/scala/src/main/org/apache/spark/streaming/api/csharp/CSharpDStream.scala index 93d5e58c..1d87e539 100644 --- a/scala/src/main/org/apache/spark/streaming/api/csharp/CSharpDStream.scala +++ b/scala/src/main/org/apache/spark/streaming/api/csharp/CSharpDStream.scala @@ -29,7 +29,7 @@ import org.apache.spark.streaming.api.java._ import scala.language.existentials -object CSharpDStream { +object CSharpDStream extends Logging{ // Variables for debugging var debugMode = false @@ -78,9 +78,7 @@ object CSharpDStream { case e: Exception => // log exception only when callback socket is not shutdown explicitly if (!CSharpBackend.callbackSocketShutdown) { - // TODO: change println to log - System.err.println("CSharp transform callback failed with " + e) // scalastyle:off println - e.printStackTrace() + logError(s"CSharp transform callback failed", e) } // close this socket if error happen @@ -89,7 +87,7 @@ object CSharpDStream { socket.close() } catch { - case e: Exception => println("Exception when closing socket: " + e) + case e: Exception => logError("Exception when closing socket", e) } } From 67046cef3e1d642ac6402f28f111490518230c0e Mon Sep 17 00:00:00 2001 From: dwnichols Date: Fri, 16 Sep 2016 14:13:48 -0400 Subject: [PATCH 06/15] Update run-samples.sh to use unix path separators --- build/localmode/run-samples.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/localmode/run-samples.sh b/build/localmode/run-samples.sh index 5c6486bb..37846c51 100755 --- a/build/localmode/run-samples.sh +++ b/build/localmode/run-samples.sh @@ -73,9 +73,9 @@ fi export SPARKCLR_HOME="$FWDIR/../runtime" # spark-csv package and its depenedency are required for DataFrame operations in Mobius -export SPARKCLR_EXT_PATH="$SPARKCLR_HOME\dependencies" -export SPARKCSV_JAR1PATH="$SPARKCLR_EXT_PATH\spark-csv_2.10-1.3.0.jar" -export SPARKCSV_JAR2PATH="$SPARKCLR_EXT_PATH\commons-csv-1.1.jar" +export SPARKCLR_EXT_PATH="$SPARKCLR_HOME/dependencies" +export SPARKCSV_JAR1PATH="$SPARKCLR_EXT_PATH/spark-csv_2.10-1.3.0.jar" +export SPARKCSV_JAR2PATH="$SPARKCLR_EXT_PATH/commons-csv-1.1.jar" export SPARKCLR_EXT_JARS="$SPARKCSV_JAR1PATH,$SPARKCSV_JAR2PATH" # run-samples.sh is in local mode, should not load Hadoop or Yarn cluster config. Disable Hadoop/Yarn conf dir. 
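
A note on the `println` → `logInfo`/`logError` conversion in the backend, handler, runner and DStream classes above: each class now mixes in Spark's internal `Logging` trait, so exceptions carry their stack traces into the driver/executor logs instead of being flattened onto stdout. Below is a minimal sketch of that pattern, assuming only Spark 2.0 on the classpath; the class and method are made up for illustration, and `Logging` is package-private to `org.apache.spark`, which is why the sketch (like the Mobius classes themselves) has to live under that package.

```scala
// Illustrative only -- not part of the Mobius source tree.
package org.apache.spark.api.csharp

import java.net.Socket

import org.apache.spark.internal.Logging

class CallbackSocketCloser extends Logging {
  def closeAll(sockets: Seq[Socket]): Unit = {
    logInfo("Requesting to close all call back sockets.")   // was: println(...)
    sockets.foreach { socket =>
      try {
        socket.close()
      } catch {
        // logError(msg, throwable) preserves the stack trace that println dropped
        case e: Exception => logError("Exception when closing socket", e)
      }
    }
  }
}
```
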
From 427ce52e8df910b6775d628a7fc2cfc411d5b8c5 Mon Sep 17 00:00:00 2001 From: dwnichols Date: Fri, 16 Sep 2016 16:32:06 -0400 Subject: [PATCH 07/15] Download external dependencies from the build.sh shell script --- build/build.sh | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/build/build.sh b/build/build.sh index 5e935cd5..25ada051 100755 --- a/build/build.sh +++ b/build/build.sh @@ -22,6 +22,41 @@ fi [ ! -d "$SPARKCLR_HOME/lib" ] && mkdir "$SPARKCLR_HOME/lib" [ ! -d "$SPARKCLR_HOME/samples" ] && mkdir "$SPARKCLR_HOME/samples" [ ! -d "$SPARKCLR_HOME/scripts" ] && mkdir "$SPARKCLR_HOME/scripts" +[ ! -d "$SPARKCLR_HOME/dependencies" ] && mkdir "$SPARKCLR_HOME/dependencies" + +echo "Download Mobius external dependencies" +pushd "$SPARKCLR_HOME/dependencies" + +download_dependency() { + LINK=$1 + JAR=$2 + + if [ ! -e $JAR ]; + then + wget $LINK -O $JAR + + if [ ! -e $JAR ]; + then + echo "Cannot download external dependency $JAR from $LINK" + popd + exit 1 + fi + fi +} + +SPARK_CSV_LINK="http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar" +SPARK_CSV_JAR="spark-csv_2.10-1.3.0.jar" +download_dependency $SPARK_CSV_LINK $SPARK_CSV_JAR + +COMMONS_CSV_LINK="http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.1/commons-csv-1.1.jar" +COMMONS_CSV_JAR="commons-csv-1.1.jar" +download_dependency $COMMONS_CSV_LINK $COMMONS_CSV_JAR + +SPARK_STREAMING_KAFKA_LINK="http://search.maven.org/remotecontent?filepath=org/apache/spark/spark-streaming-kafka-0-8-assembly_2.11/2.0.0/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar" +SPARK_STREAMING_KAFKA_JAR="spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar" +download_dependency $SPARK_STREAMING_KAFKA_LINK $SPARK_STREAMING_KAFKA_JAR + +popd echo "Assemble Mobius Scala components" pushd "$FWDIR/../scala" @@ -36,7 +71,7 @@ mvn clean -q # build the package mvn package -Puber-jar -q -if [ $? -ne 0 ] +if [ $? -ne 0 ]; then echo "Build Mobius Scala components failed, stop building." popd From 3da5beb91ed2d4dbb83addae6b334ce43949f5ab Mon Sep 17 00:00:00 2001 From: dwnichols Date: Fri, 16 Sep 2016 18:21:19 -0400 Subject: [PATCH 08/15] Store external dependencies in build before copying to runtime folder --- build/build.sh | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/build/build.sh b/build/build.sh index 25ada051..6b83d359 100755 --- a/build/build.sh +++ b/build/build.sh @@ -7,25 +7,10 @@ export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -export SPARKCLR_HOME="$FWDIR/runtime" -echo "SPARKCLR_HOME=$SPARKCLR_HOME" - -if [ -d "$SPARKCLR_HOME" ]; -then - echo "Delete existing $SPARKCLR_HOME ..." - rm -r -f "$SPARKCLR_HOME" -fi - -[ ! -d "$SPARKCLR_HOME" ] && mkdir "$SPARKCLR_HOME" -[ ! -d "$SPARKCLR_HOME/bin" ] && mkdir "$SPARKCLR_HOME/bin" -[ ! -d "$SPARKCLR_HOME/data" ] && mkdir "$SPARKCLR_HOME/data" -[ ! -d "$SPARKCLR_HOME/lib" ] && mkdir "$SPARKCLR_HOME/lib" -[ ! -d "$SPARKCLR_HOME/samples" ] && mkdir "$SPARKCLR_HOME/samples" -[ ! -d "$SPARKCLR_HOME/scripts" ] && mkdir "$SPARKCLR_HOME/scripts" -[ ! -d "$SPARKCLR_HOME/dependencies" ] && mkdir "$SPARKCLR_HOME/dependencies" +[ ! -d "$FWDIR/dependencies" ] && mkdir "$FWDIR/dependencies" echo "Download Mobius external dependencies" -pushd "$SPARKCLR_HOME/dependencies" +pushd "$FWDIR/dependencies" download_dependency() { LINK=$1 @@ -33,7 +18,8 @@ download_dependency() { if [ ! 
-e $JAR ]; then - wget $LINK -O $JAR + echo "Downloading $JAR" + wget -q $LINK -O $JAR if [ ! -e $JAR ]; then @@ -58,6 +44,27 @@ download_dependency $SPARK_STREAMING_KAFKA_LINK $SPARK_STREAMING_KAFKA_JAR popd +export SPARKCLR_HOME="$FWDIR/runtime" +echo "SPARKCLR_HOME=$SPARKCLR_HOME" + +if [ -d "$SPARKCLR_HOME" ]; +then + echo "Delete existing $SPARKCLR_HOME ..." + rm -r -f "$SPARKCLR_HOME" +fi + +[ ! -d "$SPARKCLR_HOME" ] && mkdir "$SPARKCLR_HOME" +[ ! -d "$SPARKCLR_HOME/bin" ] && mkdir "$SPARKCLR_HOME/bin" +[ ! -d "$SPARKCLR_HOME/data" ] && mkdir "$SPARKCLR_HOME/data" +[ ! -d "$SPARKCLR_HOME/lib" ] && mkdir "$SPARKCLR_HOME/lib" +[ ! -d "$SPARKCLR_HOME/samples" ] && mkdir "$SPARKCLR_HOME/samples" +[ ! -d "$SPARKCLR_HOME/scripts" ] && mkdir "$SPARKCLR_HOME/scripts" +[ ! -d "$SPARKCLR_HOME/dependencies" ] && mkdir "$SPARKCLR_HOME/dependencies" + +echo "Assemble Mobius external dependencies" +cp $FWDIR/dependencies/* "$SPARKCLR_HOME/dependencies/" +[ $? -ne 0 ] && exit 1 + echo "Assemble Mobius Scala components" pushd "$FWDIR/../scala" From 3c767bddd44229fbee95ff8b39f7323fb701c2d3 Mon Sep 17 00:00:00 2001 From: Quanmao LIU Date: Tue, 20 Sep 2016 13:41:29 +0800 Subject: [PATCH 09/15] Add Python version performance benchmark test; Add usage + example and RIOSocket option support for C# version benchmark; Update Scala version benchmark; Update csv package version. (#565) Besides normal spark-submit , there's a convenient way using https://github.com/qualiu/testMobius 1.Cluster mode examples as following: D:\msgit\lqmMobius\testMobius\scripts\perf\submit-python-perf-test.bat d:\msgit\lqmMobius\python\perf 10 hdfs:///perf/data/deletions/* D:\msgit\lqmMobius\testMobius\scripts\perf\submit-scala-perf-test.bat d:\msgit\lqmMobius\scala\perf 10 D:\msgit\lqmMobius\testMobius\scripts\perf\submit-mobius-perf-test.bat d:\msgit\lqmMobius\csharp\perf 10 2.Local mode examples : (1) First set SparkOptions : just copy the "Local Mode set SparkOptions=" after running the bat without arguments. 
(2) Then submit local test : (you should have downloaded or copied the test data, like d:\mobius\deletions) D:\msgit\lqmMobius\testMobius\scripts\perf\submit-python-perf-test.bat d:\msgit\lqmMobius\python\perf 3 d:\mobius\deletions\deletions.csv-00000-of-00020 D:\msgit\lqmMobius\testMobius\scripts\perf\submit-scala-perf-test.bat d:\msgit\lqmMobius\scala\perf 3 d:\mobius\deletions\deletions.csv-00000-of-00020 D:\msgit\lqmMobius\testMobius\scripts\perf\submit-mobius-perf-test.bat d:\msgit\lqmMobius\csharp\perf 3 d:\mobius\deletions\deletions.csv-00000-of-00020 --- .gitignore | 1 + build/build.sh | 8 +- build/localmode/RunSamples.cmd | 4 +- build/localmode/downloadtools.ps1 | 8 +- build/localmode/run-samples.sh | 4 +- .../FreebaseDeletionsBenchmark.cs | 20 ++-- .../PerfBenchmark.csproj | 15 +++ csharp/Perf/Microsoft.Spark.CSharp/Program.cs | 32 +++++-- python/perf/FreebaseDeletionsBenchmark.py | 96 +++++++++++++++++++ python/perf/PerfBenchmark.py | 61 ++++++++++++ python/perf/spark-python-perf.py | 33 +++++++ scala/perf/pom.xml | 2 +- .../csharp/FreebaseDeletionsBenchmark.scala | 30 +++--- .../spark/csharp/PerfBenchmark.scala | 61 +++++++----- scala/pom.xml | 2 +- 15 files changed, 304 insertions(+), 73 deletions(-) create mode 100644 python/perf/FreebaseDeletionsBenchmark.py create mode 100644 python/perf/PerfBenchmark.py create mode 100644 python/perf/spark-python-perf.py diff --git a/.gitignore b/.gitignore index b42159a0..7354c165 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ *.class *.dll *.exe +*.pyc # Packages # ############ diff --git a/build/build.sh b/build/build.sh index 6b83d359..21713db5 100755 --- a/build/build.sh +++ b/build/build.sh @@ -30,12 +30,12 @@ download_dependency() { fi } -SPARK_CSV_LINK="http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar" -SPARK_CSV_JAR="spark-csv_2.10-1.3.0.jar" +SPARK_CSV_LINK="http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.4.0/spark-csv_2.10-1.4.0.jar" +SPARK_CSV_JAR="spark-csv_2.10-1.4.0.jar" download_dependency $SPARK_CSV_LINK $SPARK_CSV_JAR -COMMONS_CSV_LINK="http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.1/commons-csv-1.1.jar" -COMMONS_CSV_JAR="commons-csv-1.1.jar" +COMMONS_CSV_LINK="http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.4/commons-csv-1.4.jar" +COMMONS_CSV_JAR="commons-csv-1.4.jar" download_dependency $COMMONS_CSV_LINK $COMMONS_CSV_JAR SPARK_STREAMING_KAFKA_LINK="http://search.maven.org/remotecontent?filepath=org/apache/spark/spark-streaming-kafka-0-8-assembly_2.11/2.0.0/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar" diff --git a/build/localmode/RunSamples.cmd b/build/localmode/RunSamples.cmd index 6ad9094c..3b75e857 100644 --- a/build/localmode/RunSamples.cmd +++ b/build/localmode/RunSamples.cmd @@ -68,8 +68,8 @@ set SPARKCLR_HOME=%CMDHOME%\..\runtime @rem spark-csv package and its depenedency are required for DataFrame operations in Mobius set SPARKCLR_EXT_PATH=%SPARKCLR_HOME%\dependencies -set SPARKCSV_JAR1PATH=%SPARKCLR_EXT_PATH%\spark-csv_2.10-1.3.0.jar -set SPARKCSV_JAR2PATH=%SPARKCLR_EXT_PATH%\commons-csv-1.1.jar +set SPARKCSV_JAR1PATH=%SPARKCLR_EXT_PATH%\spark-csv_2.10-1.4.0.jar +set SPARKCSV_JAR2PATH=%SPARKCLR_EXT_PATH%\commons-csv-1.4.jar set SPARKCLR_EXT_JARS=%SPARKCSV_JAR1PATH%,%SPARKCSV_JAR2PATH% @rem RunSamples.cmd is in local mode, should not load Hadoop or Yarn cluster config. Disable Hadoop/Yarn conf dir. 
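
For context on the spark-csv version bump in these scripts and the `C1` → `_c1` column renames in the FreebaseDeletionsBenchmark diff further below: with Spark 1.x plus the spark-csv package, headerless files got default column names `C0, C1, ...`, whereas Spark 2.0's built-in CSV reader names them `_c0, _c1, ...`. The following is a rough, self-contained Scala sketch of the equivalent query; everything here is illustrative (the object name, session setup, and output formatting are not part of Mobius), and the input path is the one mentioned in the commit message above.

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: read the Freebase deletions CSV with Spark 2.0's built-in
// csv source and reproduce the benchmark's filter/groupBy using the
// auto-generated _cN column names.
object DeletionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("deletions-sketch").getOrCreate()
    val path  = "hdfs:///perf/data/deletions/*"   // path taken from the commit message above

    val deletions = spark.read
      .option("header", "false")   // the dataset has no header row
      .csv(path)                   // columns come back as _c0, _c1, _c2, ...

    val filtered = deletions.filter("_c1 = _c3")       // same predicate as the C# benchmark
    val counts   = filtered.groupBy("_c1").count()
    counts.orderBy(counts("count").desc).show(1)       // user with the most deletions

    spark.stop()
  }
}
```
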
diff --git a/build/localmode/downloadtools.ps1 b/build/localmode/downloadtools.ps1 index b71f355d..78d20a91 100644 --- a/build/localmode/downloadtools.ps1 +++ b/build/localmode/downloadtools.ps1 @@ -347,14 +347,14 @@ function Download-ExternalDependencies $readMeStream.WriteLine("------------ Dependencies for CSV parsing in Mobius DataFrame API -----------------------------") # Downloading spark-csv package and its depenency. These packages are required for DataFrame operations in Mobius - $url = "http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar" - $output="$scriptDir\..\dependencies\spark-csv_2.10-1.3.0.jar" + $url = "http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.4.0/spark-csv_2.10-1.4.0.jar" + $output="$scriptDir\..\dependencies\spark-csv_2.10-1.4.0.jar" Download-File $url $output Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies" $readMeStream.WriteLine("$url") - $url = "http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.1/commons-csv-1.1.jar" - $output="$scriptDir\..\dependencies\commons-csv-1.1.jar" + $url = "http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.4/commons-csv-1.4.jar" + $output="$scriptDir\..\dependencies\commons-csv-1.4.jar" Download-File $url $output Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies" $readMeStream.WriteLine("$url") diff --git a/build/localmode/run-samples.sh b/build/localmode/run-samples.sh index 37846c51..4e94bca2 100755 --- a/build/localmode/run-samples.sh +++ b/build/localmode/run-samples.sh @@ -74,8 +74,8 @@ fi export SPARKCLR_HOME="$FWDIR/../runtime" # spark-csv package and its depenedency are required for DataFrame operations in Mobius export SPARKCLR_EXT_PATH="$SPARKCLR_HOME/dependencies" -export SPARKCSV_JAR1PATH="$SPARKCLR_EXT_PATH/spark-csv_2.10-1.3.0.jar" -export SPARKCSV_JAR2PATH="$SPARKCLR_EXT_PATH/commons-csv-1.1.jar" +export SPARKCSV_JAR1PATH="$SPARKCLR_EXT_PATH/spark-csv_2.10-1.4.0.jar" +export SPARKCSV_JAR2PATH="$SPARKCLR_EXT_PATH/commons-csv-1.4.jar" export SPARKCLR_EXT_JARS="$SPARKCSV_JAR1PATH,$SPARKCSV_JAR2PATH" # run-samples.sh is in local mode, should not load Hadoop or Yarn cluster config. Disable Hadoop/Yarn conf dir. diff --git a/csharp/Perf/Microsoft.Spark.CSharp/FreebaseDeletionsBenchmark.cs b/csharp/Perf/Microsoft.Spark.CSharp/FreebaseDeletionsBenchmark.cs index d4e8d6f8..f000cf98 100644 --- a/csharp/Perf/Microsoft.Spark.CSharp/FreebaseDeletionsBenchmark.cs +++ b/csharp/Perf/Microsoft.Spark.CSharp/FreebaseDeletionsBenchmark.cs @@ -12,7 +12,7 @@ namespace Microsoft.Spark.CSharp.PerfBenchmark /// /// Perf benchmark that users Freebase deletions data /// This data is licensed under CC-BY license (http://creativecommons.org/licenses/by/2.5) - /// Data is available for download at https://developers.google.com/freebase/data) + /// Data is available for downloading : "Freebase Deleted Triples" at https://developers.google.com/freebase /// Data format - CSV, size - 8 GB uncompressed /// Columns in the dataset are /// 1. 
creation_timestamp (Unix epoch time in milliseconds) @@ -55,7 +55,7 @@ internal static void RunRDDMaxDeletionsByUser(string[] args) var lines = PerfBenchmark.SparkContext.TextFile(filePath); var parsedRows = lines.Map(s => { - var columns = s.Split(new[] {','}); + var columns = s.Split(new[] { ',' }); //data has some bad records - use bool flag to indicate corrupt rows if (columns.Length > 4) @@ -75,7 +75,7 @@ internal static void RunRDDMaxDeletionsByUser(string[] args) else return kvp2; }); - + stopwatch.Stop(); PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed); @@ -101,22 +101,22 @@ internal static void RunDFMaxDeletionsByUser(string[] args) stopwatch.Restart(); var rows = PerfBenchmark.SqlContext.TextFile(args[2]); - var filtered = rows.Filter("C1 = C3"); - var aggregated = filtered.GroupBy("C1").Agg(new Dictionary { { "C1", "count" } }); + var filtered = rows.Filter("_c1 = _c3"); + var aggregated = filtered.GroupBy("_c1").Agg(new Dictionary { { "_c1", "count" } }); aggregated.RegisterTempTable("freebasedeletions"); - var max = PerfBenchmark.SqlContext.Sql("select max(`count(C1)`) from freebasedeletions"); + var max = PerfBenchmark.SqlContext.Sql("select max(`count(_c1)`) from freebasedeletions"); var maxArray = max.Collect(); var maxValue = maxArray.First(); - var maxDeletions = PerfBenchmark.SqlContext.Sql("select * from freebasedeletions where `count(C1)` = " + maxValue.Get(0)); + var maxDeletions = PerfBenchmark.SqlContext.Sql("select * from freebasedeletions where `count(_c1)` = " + maxValue.Get(0)); maxDeletions.Show(); //TODO - add perf suite for subquery stopwatch.Stop(); PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed); Console.WriteLine("User with max deletions & count of deletions is listed above. Time elapsed {0}", stopwatch.Elapsed); - + } - - + + } } diff --git a/csharp/Perf/Microsoft.Spark.CSharp/PerfBenchmark.csproj b/csharp/Perf/Microsoft.Spark.CSharp/PerfBenchmark.csproj index 2718f936..70fc5124 100644 --- a/csharp/Perf/Microsoft.Spark.CSharp/PerfBenchmark.csproj +++ b/csharp/Perf/Microsoft.Spark.CSharp/PerfBenchmark.csproj @@ -11,6 +11,7 @@ SparkCLRPerf v4.5 512 + HasCpp AnyCPU @@ -48,6 +49,16 @@ + + + PreserveNewest + Cpp\Riosock.dll + + + PreserveNewest + Cpp\Riosock.pdb + + {ce999a96-f42b-4e80-b208-709d7f49a77c} @@ -60,6 +71,10 @@ + + copy /y $(ProjectDir)..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\$(ConfigurationName)\CSharpWorker.* $(TargetDir) + cp -uv $(ProjectDir)../../../csharp/Worker/Microsoft.Spark.CSharp/bin/$(ConfigurationName)/CSharpWorker.* $(TargetDir) + diff --git a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala index f13d0087..c01d76a7 100644 --- a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala +++ b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala @@ -15,8 +15,10 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive import org.apache.spark.sql.types.{DataType, FloatType, StructType} import org.apache.spark.sql._ -import java.util.{List => JList, Map => JMap, ArrayList => JArrayList} +import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} + import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.hive.HiveContext /** * Utility functions for DataFrame in SparkCLR @@ -29,6 +31,12 @@ object SQLUtils { new SparkSession(sc) } + // this method is for back compat with older versions of Spark (1.4, 1.5 & 1.6) + // can be removed once Mobius upgrades to Spark 2.1 + def createHiveContext(sc: 
SparkContext): SQLContext = { + new HiveContext(sc) + } + def getSqlContext(ss: SparkSession): SQLContext = { ss.sqlContext } From 7d920d12065c68759b1856949e096a2ca8e0486c Mon Sep 17 00:00:00 2001 From: skaarthik Date: Mon, 10 Oct 2016 19:11:55 -0700 Subject: [PATCH 13/15] fixing the issue in YARN clusters introduced during Spark 2.0 upgrade --- .../Configuration/ConfigurationService.cs | 4 +++- scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Configuration/ConfigurationService.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Configuration/ConfigurationService.cs index 1e8abbae..cf630391 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Configuration/ConfigurationService.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Configuration/ConfigurationService.cs @@ -65,7 +65,9 @@ internal ConfigurationService() configuration = new SparkCLRConfiguration(appConfig); runMode = RunMode.CLUSTER; } - else if (sparkMaster.Equals("yarn-client", StringComparison.OrdinalIgnoreCase) || sparkMaster.Equals("yarn-cluster", StringComparison.OrdinalIgnoreCase)) + else if (sparkMaster.Equals("yarn-cluster", StringComparison.OrdinalIgnoreCase) || + sparkMaster.Equals("yarn-client", StringComparison.OrdinalIgnoreCase) || + sparkMaster.Equals("yarn", StringComparison.OrdinalIgnoreCase)) //supported in Spark 2.0 { configuration = new SparkCLRConfiguration(appConfig); runMode = RunMode.YARN; diff --git a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala index 1faf7766..d48e9f3b 100644 --- a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala +++ b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala @@ -68,7 +68,7 @@ class CSharpRDD( val func = SQLUtils.createCSharpFunction(command, envVars, cSharpIncludes, - cSharpWorkerExecutable, + cSharpWorker.getAbsolutePath, unUsedVersionIdentifier, broadcastVars, accumulator) From e64e342b33295eaa0671e646e4c2498d1b95b85d Mon Sep 17 00:00:00 2001 From: Kaarthik Sivashanmugam Date: Tue, 11 Oct 2016 12:38:06 -0700 Subject: [PATCH 14/15] adding instructions for HDI --- README.md | 2 +- notes/linux-instructions.md | 3 +-- notes/mobius-in-hdinsight.md | 26 ++++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 notes/mobius-in-hdinsight.md diff --git a/README.md b/README.md index a1ce8255..55b3d28c 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ Refer to the [docs folder](docs) for design overview and other info on Mobius |Build & run unit tests |[Build in Windows](notes/windows-instructions.md#building-mobius) |[Build in Linux](notes/linux-instructions.md#building-mobius-in-linux) | |Run samples (functional tests) in local mode |[Samples in Windows](notes/windows-instructions.md#running-samples) |[Samples in Linux](notes/linux-instructions.md#running-mobius-samples-in-linux) | |Run examples in local mode |[Examples in Windows](/notes/running-mobius-app.md#running-mobius-examples-in-local-mode) |[Examples in Linux](notes/linux-instructions.md#running-mobius-examples-in-linux) | -|Run Mobius app |
  • [Standalone cluster](notes/running-mobius-app.md#standalone-cluster)
  • [YARN cluster](notes/running-mobius-app.md#yarn-cluster)
|
  • [Linux cluster](notes/linux-instructions.md#running-mobius-applications-in-linux)
  • [Azure HDInsight Spark Cluster](/notes/linux-instructions.md#mobius-in-azure-hdinsight-spark-cluster)
  • [AWS EMR Spark Cluster](/notes/linux-instructions.md#mobius-in-amazon-web-services-emr-spark-cluster)
  • | +|Run Mobius app |
    • [Standalone cluster](notes/running-mobius-app.md#standalone-cluster)
    • [YARN cluster](notes/running-mobius-app.md#yarn-cluster)
    |
    • [Linux cluster](notes/linux-instructions.md#running-mobius-applications-in-linux)
    • [Azure HDInsight Spark Cluster](/notes/mobius-in-hdinsight.md)
    • [AWS EMR Spark Cluster](/notes/linux-instructions.md#mobius-in-amazon-web-services-emr-spark-cluster)
    • | |Run Mobius Shell |
      • [Local](notes/mobius-shell.md#run-shell)
      • [YARN](notes/mobius-shell.md#run-shell)
      | Not supported yet | ### Useful Links diff --git a/notes/linux-instructions.md b/notes/linux-instructions.md index e84f8e56..4a6e1972 100644 --- a/notes/linux-instructions.md +++ b/notes/linux-instructions.md @@ -52,8 +52,7 @@ If you are using CentOS, Fedora, or similar Linux distributions or OS X, follow * Update CSharpWorkerPath setting in Mobius application config (refer to the config files used in Mobius examples like the [config for with Pi example](https://github.com/skaarthik/Mobius/blob/linux/examples/Batch/pi/App.config#L61)) to point to [CSharpWorker.sh.exe](./linux-csharpworker-prefix-script.md) (make sure to set the correct value appropriate for the Spark mode to be used) ### Mobius in Azure HDInsight Spark Cluster -* Mono version available in HDInsight cluster is 3.x. Mobius [requires](./linux-instructions.md#prerequisites) 4.2 or above. So, Mono has to be upgraded in HDInsight cluster to use Mobius. -* Follow [instructions](./linux-instructions.md#requirements) for Ubuntu +* Refer to [instructions](./mobius-in-hdinsight.md) for running Mobius application in HDI ### Mobius in Amazon Web Services EMR Spark Cluster * Follow [instructions](./linux-instructions.md#requirements) for CentOS diff --git a/notes/mobius-in-hdinsight.md b/notes/mobius-in-hdinsight.md new file mode 100644 index 00000000..9c04fb42 --- /dev/null +++ b/notes/mobius-in-hdinsight.md @@ -0,0 +1,26 @@ +#Using Mobius in HDInsight Spark Cluster +Mobius [requires](./linux-instructions.md#prerequisites) Mono version 4.2 or above. Depending on the HDI cluster version, manual upgrade of Mono in head and worker nodes may be required. Refer to the table below for Mono upgrade requirements. + +|HDI Version |Mono Version |Mono Upgrade Required | +|---|:------|:----| +3.4 |3.4 |Yes | +3.5 |4.6.1 |No | + +After ensuring that the correct version of Mono is available in the HDI cluster, [instructions](./linux-instructions.md#requirements) to run Mobius applications in HDI are similiar to that of any Ubuntu-based Spark cluster using YARN. Following steps illustrate how to run Mobius Pi example in HDI. 
+ +``` +# login to head node +# create mobius folder under /home/username +mkdir mobius +cd mobius +# replace the url below with the correct version of Mobius +wget https://github.com/Microsoft/Mobius/releases/download/v2.0.000-PREVIEW-2/spark-clr_2.11-2.0.000-PREVIEW-2.zip +unzip spark-clr_2.11-2.0.000-PREVIEW-2.zip +export SPARKCLR_HOME=/home/username/mobius/runtime +cd runtime/scripts +chmod +x sparkclr-submit.sh +# make sure Mobius app has executable permissions +chmod +x ../../examples/Batch/pi/SparkClrPi.exe +# deploy mode can be client or cluster +./sparkclr-submit.sh --master yarn --deploy-mode client --exe SparkClrPi.exe /home/username/mobius/examples/Batch/pi +``` From 9a8fbe811d3556a4454c1b4544ed32a74f9e69d4 Mon Sep 17 00:00:00 2001 From: Kaarthik Sivashanmugam Date: Tue, 11 Oct 2016 12:38:22 -0700 Subject: [PATCH 15/15] updating AssemblyInfo versions --- .../Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs | 5 +++-- csharp/AdapterTest/Properties/AssemblyInfo.cs | 4 ++-- .../Perf/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs | 4 ++-- csharp/Repl/Properties/AssemblyInfo.cs | 4 ++-- csharp/ReplTest/Properties/AssemblyInfo.cs | 4 ++-- .../Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs | 4 ++-- csharp/Tests.Common/Properties/AssemblyInfo.cs | 4 ++-- .../Utils/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs | 4 ++-- .../Worker/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs | 4 ++-- csharp/WorkerTest/Properties/AssemblyInfo.cs | 4 ++-- 10 files changed, 21 insertions(+), 20 deletions(-) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs index b2c4cfc0..5f6b677b 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs @@ -30,5 +30,6 @@ // Build Number // Revision // -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] + diff --git a/csharp/AdapterTest/Properties/AssemblyInfo.cs b/csharp/AdapterTest/Properties/AssemblyInfo.cs index c9fd48f6..91a4e560 100644 --- a/csharp/AdapterTest/Properties/AssemblyInfo.cs +++ b/csharp/AdapterTest/Properties/AssemblyInfo.cs @@ -30,5 +30,5 @@ // Build Number // Revision // -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] \ No newline at end of file diff --git a/csharp/Perf/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs b/csharp/Perf/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs index 7df7032c..b6e947d8 100644 --- a/csharp/Perf/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs +++ b/csharp/Perf/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs @@ -34,5 +34,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/Repl/Properties/AssemblyInfo.cs b/csharp/Repl/Properties/AssemblyInfo.cs index 154b50f0..9efe4ae6 100644 --- a/csharp/Repl/Properties/AssemblyInfo.cs +++ b/csharp/Repl/Properties/AssemblyInfo.cs @@ -35,5 +35,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // 
[assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/ReplTest/Properties/AssemblyInfo.cs b/csharp/ReplTest/Properties/AssemblyInfo.cs index 261bcb13..72a12cd7 100644 --- a/csharp/ReplTest/Properties/AssemblyInfo.cs +++ b/csharp/ReplTest/Properties/AssemblyInfo.cs @@ -35,5 +35,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs b/csharp/Samples/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs index ade554ff..06379a7e 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs @@ -30,5 +30,5 @@ // Build Number // Revision // -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/Tests.Common/Properties/AssemblyInfo.cs b/csharp/Tests.Common/Properties/AssemblyInfo.cs index 74717c60..d315980a 100644 --- a/csharp/Tests.Common/Properties/AssemblyInfo.cs +++ b/csharp/Tests.Common/Properties/AssemblyInfo.cs @@ -35,5 +35,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.0.0.0")] -[assembly: AssemblyFileVersion("1.0.0.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/Utils/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs b/csharp/Utils/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs index d1988d52..127269b5 100644 --- a/csharp/Utils/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs +++ b/csharp/Utils/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs @@ -32,5 +32,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.6.0.0")] -[assembly: AssemblyFileVersion("1.6.0.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/Worker/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs b/csharp/Worker/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs index 2f8b77e1..2b9547ad 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/Properties/AssemblyInfo.cs @@ -30,5 +30,5 @@ // Build Number // Revision // -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] +[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")] diff --git a/csharp/WorkerTest/Properties/AssemblyInfo.cs b/csharp/WorkerTest/Properties/AssemblyInfo.cs index df526bcc..eea728bd 100644 --- a/csharp/WorkerTest/Properties/AssemblyInfo.cs +++ b/csharp/WorkerTest/Properties/AssemblyInfo.cs @@ -32,5 +32,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.6.1.0")] -[assembly: AssemblyFileVersion("1.6.1.0")] 
+[assembly: AssemblyVersion("2.0")] +[assembly: AssemblyFileVersion("2.0")]