PolyBase cannot connect to HDInsight!

All attempts to connect to HDInsight via PolyBase will fail, because Microsoft does not support its own Hadoop cloud PaaS as a PolyBase target. So there is no LOCATION = hdfs://HeadNodeIP:Port that will work.

This is true even though the MOC 20467C course "Designing Self-Service Business Intelligence and Big Data Solutions" claims that you can use PolyBase to ingest data from HDInsight.

See the modified code example below.

----------------------------------------------------------------
-- AdventureWorksDW2016CTP3 samples: PolyBase
-- https://thesqlpro.com/2016/04/27/polybase-i-now-pronounce-you-sql-and-hadoop/
---------------------------------------------------------------
-- This sample will show you how to query and load data from Azure blob storage
-- to AdventureWorks2016CTP3 database using PolyBase.
-- All attempts to connect to HDInsight must fail.
SELECT SERVERPROPERTY('IsPolybaseInstalled') AS IsPolybaseInstalled;
GO
CREATE DATABASE PolyBaseConnect;
GO
----------------------------------------------------------------
-- AdventureWorksDW2016CTP3 samples: PolyBase
--Download here:
--https://www.microsoft.com/en-us/download/details.aspx?id=49502
--The sample code below was obtained from the Adventure Works SQL2016CTP3 demos.
--I have modified it slightly and added my own code.
--The flat file used in the example is in the SQLServer2016CTP3Samples.zip file.
--In order to copy it to your Azure Blob Storage, I recommend you download
--and use Azure Storage Explorer.
----------------------------------------------------------------
-- This sample will show you how to query and load data from Azure blob storage
-- to AdventureWorks2016CTP3 database using PolyBase.

USE PolyBaseConnect
go

------------------------------- Configuration ------------------------------------------------------------------

-- Specify the type of data source you want to query.
-- Option 0: Disable Hadoop connectivity
-- Option 1: Hortonworks HDP 1.3 on Windows Server, Azure blob storage (WASB[S])
-- Option 2: Hortonworks HDP 1.3 on Linux
-- Option 3: Cloudera CDH 4.3 on Linux
-- Option 4: Hortonworks HDP 2.0 on Windows Server, Azure blob storage (WASB[S])
-- Option 5: Hortonworks HDP 2.0 on Linux
-- Option 6: Cloudera CDH 5.1, 5.2, 5.3, 5.4, and 5.5 on Linux
-- Option 7: Hortonworks HDP 2.1, 2.2, and 2.3 on Linux or Windows Server, Azure blob storage (WASB[S])
-- Choose Option 7 for Azure blob storage.
exec sp_configure 'hadoop connectivity', 7;
Reconfigure;

-- Restart SQL Server to apply the changes. This will automatically restart the
-- SQL Server PolyBase Engine and SQL Server PolyBase Data Movement Service as well.

-- Verify hadoop connectivity run_value is set to 7.
exec sp_configure 'hadoop connectivity';
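
-- As an alternative (a quick sketch using the standard sys.configurations
-- catalog view behind sp_configure), check the configured and running values
-- directly; value_in_use should read 7 after the restart:
SELECT name, value, value_in_use
FROM sys.configurations
WHERE name = 'hadoop connectivity';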

------------------------------- Polybase Building Blocks --------------------------------------------------------

-- STEP 1: Create a database master key to encrypt the database scoped
--credential secret in the next step.
-- Replace the password below with one of your own to encrypt the master key.
--DROP MASTER KEY
CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'P@ssw0rd';

-- STEP 2: Create a database scoped credential to authenticate against your
--Azure storage account.
-- Replace the SECRET below with your Azure storage account key (primary access key).
-- To find the key, open your storage account in the Azure Portal
--(https://portal.azure.com/).
--DROP DATABASE SCOPED CREDENTIAL AzureStorageCredential

CREATE DATABASE SCOPED CREDENTIAL AzureStorageCredential
WITH IDENTITY = 'hdfs4hdi', --Storage account name
SECRET = '91d2yV8kRLOom7xlJnZd6q9hUI6Oxs79wLxbEY0WrYjog8ZGTk33yGV1WSJ6fRK1O2hu5BgIAfEgu205Hy3QMA=='; -- access Key

-- Create a database scoped credential only for Kerberos-secured Hadoop clusters.
-- IDENTITY: the Kerberos user name.
-- SECRET: the Kerberos password

CREATE DATABASE SCOPED CREDENTIAL HadoopUser
WITH IDENTITY = 'Trainer', Secret = 'P@ssw0rd';

select * from sys.database_scoped_credentials;

-- STEP 3: Create an external data source to specify the location and credential
--for your Azure storage account.
-- Replace the container name with your Azure storage blob container.
-- Replace the account name with your Azure storage account name.

--DROP EXTERNAL DATA SOURCE AzurePolybaseStorage
CREATE EXTERNAL DATA SOURCE AzurePolybaseStorage
WITH (
TYPE = HADOOP,
LOCATION = 'wasbs://hdi4training@hdfs4hdi.blob.core.windows.net',
CREDENTIAL = AzureStorageCredential
);
---'wasbs://[ContainerName]@[StorageAccountName].blob.core.windows.net',

-- Create an external data source.
-- LOCATION (Required): Hadoop NameNode IP address and port.
-- If no port is specified, the default port for the configured 'hadoop connectivity'
-- option (here Option 7) is assumed; see https://msdn.microsoft.com/en-us/library/dn935022.aspx
-- RESOURCE_MANAGER_LOCATION (Optional): Hadoop Resource Manager location to enable pushdown computation - usually the head node IP.
-- CREDENTIAL (Optional): the database scoped credential created above.

-- DROP EXTERNAL DATA SOURCE HadoopCluster;
CREATE EXTERNAL DATA SOURCE HadoopCluster WITH (
TYPE = HADOOP,
--LOCATION ='hdfs://168.63.129.16:8020'
--LOCATION ='hdfs://10.0.0.15'
LOCATION ='hdfs://hdi4training.azurehdinsight.net'  -- sorry, HDInsight is not supported yet
, RESOURCE_MANAGER_LOCATION = 'hdi4training.azurehdinsight.net:8050'   -- YARN/JobTracker endpoint, required for pushdown
--, CREDENTIAL = HadoopUser     -- only for Kerberos
);
-- Note: the external data source is not validated at creation time, only at runtime.

select * from sys.external_data_sources;
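
-- Since the data source is only validated at runtime, it can help to confirm
-- first that the PolyBase engine and data movement service are actually up.
-- A sketch using the PolyBase DMVs that ship with SQL Server 2016:
SELECT compute_node_id, type, name, address FROM sys.dm_exec_compute_nodes;
SELECT dms_core_id, compute_node_id, status FROM sys.dm_exec_dms_services;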

-- Step 4: Create an external file format to specify the layout of
--data stored in Azure blob storage.
-- The data is in a pipe-delimited text file.

CREATE EXTERNAL FILE FORMAT TextFilePipe
WITH (
FORMAT_TYPE = DelimitedText,
FORMAT_OPTIONS (FIELD_TERMINATOR = '|')
);

CREATE EXTERNAL FILE FORMAT TextFileComma
WITH (
FORMAT_TYPE = DelimitedText,
FORMAT_OPTIONS (FIELD_TERMINATOR = ',')
);

-- Create additional external file formats.
-- FORMAT_TYPE: type of format in Hadoop:
-- DELIMITEDTEXT,
-- RCFILE (in combination with SERDE_METHOD),
-- ORC (requires Hive version 0.11 or later),
-- PARQUET

CREATE EXTERNAL FILE FORMAT TextFileGzip WITH (
FORMAT_TYPE = DELIMITEDTEXT
, FORMAT_OPTIONS ( FIELD_TERMINATOR ='|'
, STRING_DELIMITER = '0x7E0x7E' -- two tilde chars (~~) given as hex values;
                                -- the double-quote char " would be 0x22
-- select char(0x7E) returns ~
, DATE_FORMAT = 'yyyy-MM-dd'
, USE_TYPE_DEFAULT = TRUE -- TRUE: a missing value becomes 0 in a numeric column,
)                         -- an empty string "" in a string column,
                          -- and 1900-01-01 in a date column.
                          -- FALSE: store all missing values as NULL.
, DATA_COMPRESSION = 'org.apache.hadoop.io.compress.GzipCodec' -- or DefaultCodec
);

CREATE EXTERNAL FILE FORMAT [rcfile]
WITH ( FORMAT_TYPE = RCFILE
, SERDE_METHOD = 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'  -- ColumnarSerDe
, DATA_COMPRESSION = 'org.apache.hadoop.io.compress.DefaultCodec'
);

CREATE EXTERNAL FILE FORMAT orcfile
WITH ( FORMAT_TYPE = ORC
, DATA_COMPRESSION = 'org.apache.hadoop.io.compress.SnappyCodec'  -- DefaultCodec
);

CREATE EXTERNAL FILE FORMAT parquetfile
WITH ( FORMAT_TYPE = PARQUET
, DATA_COMPRESSION = 'org.apache.hadoop.io.compress.SnappyCodec'  -- GzipCodec
);

select * from sys.external_file_formats;

-- Step 5: Create an external table to reference data stored
--in your Azure blob storage account.
-- Specify column properties for the table.
-- Replace LOCATION with the relative path of your file inside the blob container.
-- If the file is directly under your blob container, the location
--would simply be 'FactResellerSalesArchive.txt'.

--DROP EXTERNAL TABLE dbo.FactResellerSalesArchiveExternal
CREATE EXTERNAL TABLE dbo.FactResellerSalesArchiveExternal (
[ProductKey] [int] NOT NULL,
[OrderDateKey] [int] NOT NULL,
[DueDateKey] [int] NOT NULL,
[ShipDateKey] [int] NOT NULL,
[ResellerKey] [int] NOT NULL,
[EmployeeKey] [int] NOT NULL,
[PromotionKey] [int] NOT NULL,
[CurrencyKey] [int] NOT NULL,
[SalesTerritoryKey] [int] NOT NULL,
[SalesOrderNumber] [nvarchar](20) NOT NULL,
[SalesOrderLineNumber] [tinyint] NOT NULL,
[RevisionNumber] [tinyint] NULL,
[OrderQuantity] [smallint] NULL,
[UnitPrice] [money] NULL,
[ExtendedAmount] [money] NULL,
[UnitPriceDiscountPct] [float] NULL,
[DiscountAmount] [float] NULL,
[ProductStandardCost] [money] NULL,
[TotalProductCost] [money] NULL,
[SalesAmount] [money] NULL,
[TaxAmt] [money] NULL,
[Freight] [money] NULL,
[CarrierTrackingNumber] [nvarchar](25) NULL,
[CustomerPONumber] [nvarchar](25) NULL,
[OrderDate] [datetime] NULL,
[DueDate] [datetime] NULL,
[ShipDate] [datetime] NULL
)
WITH (
LOCATION='demo/AdventureWorksDW/FactResellerSalesArchive.txt', -- location of file/directory
DATA_SOURCE=AzurePolybaseStorage,
FILE_FORMAT=TextFilePipe
);

SELECT * FROM sys.tables;
SELECT * FROM sys.external_tables;

-- Create an external table pointing to data stored in Hadoop.
-- LOCATION: path to file or directory that contains the data (relative to HDFS root).

-- DROP EXTERNAL TABLE [dbo].[External_FactInternetSales]
CREATE EXTERNAL TABLE [dbo].[External_FactInternetSales](
[ProductKey] [int] NOT NULL,
[OrderDateKey] [int] NOT NULL,
[DueDateKey] [int] NOT NULL,
[ShipDateKey] [int] NOT NULL,
[CustomerKey] [int] NOT NULL,
[PromotionKey] [int] NOT NULL,
[CurrencyKey] [int] NOT NULL,
[SalesTerritoryKey] [int] NOT NULL,
[SalesOrderNumber] [nvarchar](20) NOT NULL,
[SalesOrderLineNumber] [tinyint] NOT NULL,
[RevisionNumber] [tinyint] NOT NULL,
[OrderQuantity] [smallint] NOT NULL,
[UnitPrice] [money] NOT NULL,
[ExtendedAmount] [money] NOT NULL,
[UnitPriceDiscountPct] [float] NOT NULL,
[DiscountAmount] [float] NOT NULL,
[ProductStandardCost] [money] NOT NULL,
[TotalProductCost] [money] NOT NULL,
[SalesAmount] [money] NOT NULL,
[TaxAmt] [money] NOT NULL,
[Freight] [money] NOT NULL,
[CarrierTrackingNumber] [nvarchar](25) NULL,
[CustomerPONumber] [nvarchar](25) NULL,
[OrderDate] [datetime] NULL,
[DueDate] [datetime] NULL,
[ShipDate] [datetime] NULL
)
WITH
(
LOCATION='demo/AdventureWorksDW/External_FactInternetSales',
DATA_SOURCE = HadoopCluster, -- modified
FILE_FORMAT = TextFileComma
);

-- Typical runtime errors when targeting HDInsight:
-- Error [External Data Source information could not be retrieved due to an internal error. Exception of type 'JNI.JavaExceptionCheckException' was thrown.]
-- Error [Invalid host name: local host is: (unknown); destination host is: "hdi4training.azurehdinsight.net":8050; java.net.UnknownHostException;
-- Error [Call From Preview2016/10.0.0.4 to hdi4training.azurehdinsight.net:8050 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=hdi4training.azurehdinsight.net/13.95.151.161:8050];
-- Error [Call From Preview2016/10.0.0.4 to hdi4training.azurehdinsight.net:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=hdi4training.azurehdinsight.net/13.95.151.161:8020];
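
-- To read the full PolyBase error details behind messages like the ones above,
-- you can query the compute node error DMV (SQL Server 2016; TOP and ORDER BY
-- are just for readability):
SELECT TOP 20 create_time, compute_node_id, details
FROM sys.dm_exec_compute_node_errors
ORDER BY create_time DESC;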

-- Manually create statistics on an external table in Hadoop.
CREATE STATISTICS StatsExternal_FactInternetSales
on [External_FactInternetSales]([SalesOrderNumber], [SalesOrderLineNumber])
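
-- If the statistics were created successfully, you can inspect them with the
-- standard DBCC SHOW_STATISTICS syntax (names match the statement above):
DBCC SHOW_STATISTICS ('dbo.External_FactInternetSales', StatsExternal_FactInternetSales);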

-- Try running queries on your external table on Azure Blob Store.
SELECT * FROM dbo.FactResellerSalesArchiveExternal; -- returns 5000 rows. 1:08 min
SELECT * FROM dbo.FactResellerSalesArchiveExternal -- returns 1959 rows
WHERE SalesAmount > 1000;

SELECT min(OrderDate), max(OrderDate) FROM dbo.FactResellerSalesArchiveExternal; -- 2015-2015
SELECT min(OrderDate), max(OrderDate) FROM [AdventureworksDW].[dbo].[FactResellerSales]; -- 11,669,638 rows - 2005-2014

SELECT [SalesOrderNumber],[SalesOrderLineNumber] FROM dbo.FactResellerSalesArchiveExternal -- 2008-2014
EXCEPT
SELECT [SalesOrderNumber],[SalesOrderLineNumber] FROM [AdventureworksDW].[dbo].[FactResellerSales];

------------------------------- Load data into your database --------------------------------------------------------

-- Step 6: Load the data from Azure blob storage
--         into a new table in your database.
SELECT * INTO dbo.FactResellerSalesArchive
FROM dbo.FactResellerSalesArchiveExternal;

-- Try a select query on this table to confirm the data has been loaded correctly.
SELECT * FROM dbo.FactResellerSalesArchive;
---My customized code---
---Not part of the Adventure Works Demo---
---Load data into a PolyBase "table"
--Allow Polybase export
USE master;
GO
EXEC sp_configure 'show advanced options', '1';
GO
EXEC sp_configure 'allow polybase export', '1';
GO
RECONFIGURE;
EXEC sp_configure;
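
-- Same pattern as before: confirm via sys.configurations that the export
-- switch actually took effect (value_in_use should be 1):
SELECT name, value_in_use
FROM sys.configurations
WHERE name = 'allow polybase export';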

---Text file names are case-sensitive!

USE PolyBaseConnect
GO

CREATE EXTERNAL TABLE ExternalArchive (
IdentityKey int,
FirstName Varchar(100),
LastName Varchar(100)
)

WITH (
LOCATION='demo/AdventureWorksDW/ArchivedData.csv', -- target location for the exported rows
DATA_SOURCE = AzurePolybaseStorage,
FILE_FORMAT = TextFileComma -- already defined
)
;

INSERT INTO ExternalArchive (IdentityKey,FirstName,LastName)
VALUES (1234, 'Willi','Winzig')

SELECT * FROM ExternalArchive

INSERT INTO ExternalArchive (IdentityKey,FirstName,LastName)
VALUES (4567, 'Oma','Müller')

--Insert as a new table

--CETAS (CREATE EXTERNAL TABLE AS SELECT) is supported in APS and Azure SQL DW
--as of publishing this post, but not yet supported in SQL Server 2016.
--There is no guarantee that this feature will be added to SQL Server 2016 in the future.
--This code is purely for educational purposes; it currently runs only
--in APS and SQL DW in Azure.

CREATE EXTERNAL TABLE dbo.External_FactInternetSales
WITH (
LOCATION = 'TestPolybaseData.txt',
DATA_SOURCE = AzurePolybaseStorage,
FILE_FORMAT = TextFileComma
)
AS
SELECT * FROM AdventureWorksDW.dbo.FactInternetSales;

--- The way to do it in SQL 2016 instead is to create the table structure first.
-- DROP EXTERNAL TABLE [dbo].[External_FactInternetSales]
CREATE EXTERNAL TABLE [dbo].[External_FactInternetSales](
[ProductKey] [int] NOT NULL,
[OrderDateKey] [int] NOT NULL,
[DueDateKey] [int] NOT NULL,
[ShipDateKey] [int] NOT NULL,
[CustomerKey] [int] NOT NULL,
[PromotionKey] [int] NOT NULL,
[CurrencyKey] [int] NOT NULL,
[SalesTerritoryKey] [int] NOT NULL,
[SalesOrderNumber] [nvarchar](20) NOT NULL,
[SalesOrderLineNumber] [tinyint] NOT NULL,
[RevisionNumber] [tinyint] NOT NULL,
[OrderQuantity] [smallint] NOT NULL,
[UnitPrice] [money] NOT NULL,
[ExtendedAmount] [money] NOT NULL,
[UnitPriceDiscountPct] [float] NOT NULL,
[DiscountAmount] [float] NOT NULL,
[ProductStandardCost] [money] NOT NULL,
[TotalProductCost] [money] NOT NULL,
[SalesAmount] [money] NOT NULL,
[TaxAmt] [money] NOT NULL,
[Freight] [money] NOT NULL,
[CarrierTrackingNumber] [nvarchar](25) NULL,
[CustomerPONumber] [nvarchar](25) NULL,
[OrderDate] [datetime] NULL,
[DueDate] [datetime] NULL,
[ShipDate] [datetime] NULL
)
WITH
(
LOCATION='demo/AdventureWorksDW/External_FactInternetSales',
DATA_SOURCE = AzurePolybaseStorage,
FILE_FORMAT = TextFileComma
);

INSERT INTO [dbo].[External_FactInternetSales]
SELECT * FROM AdventureWorksDW.dbo.FactInternetSales;

--Join a local table with the external data in WASB
SET STATISTICS IO ON

SELECT C.*, EFI.* FROM dbo.External_FactInternetSales EFI
INNER JOIN AdventureWorksDW.dbo.DimCustomer C
ON EFI.CustomerKey = C.CustomerKey

--
SELECT SalesOrderNumber,
avg(unitPrice) as AverageUnitPrice,
sum(TotalProductCost) as SumOfTotalProductCost,
sum(SalesAmount) as SumOfSalesAmount
FROM dbo.External_FactInternetSales
GROUP BY SalesOrderNumber;

---EXTERNALPUSHDOWN works on Hadoop, not on Azure Blob Storage

SELECT SalesOrderNumber,
avg(unitPrice) as AverageUnitPrice,
sum(TotalProductCost) as SumOfTotalProductCost,
sum(SalesAmount) as SumOfSalesAmount
FROM dbo.External_FactInternetSales
GROUP BY SalesOrderNumber
OPTION (FORCE EXTERNALPUSHDOWN) --THIS OPTION WILL FAIL IN WASB
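
-- The counterpart hint always succeeds, because it forces PolyBase to import
-- the raw rows and aggregate locally instead of pushing the work down:
SELECT SalesOrderNumber,
avg(unitPrice) as AverageUnitPrice,
sum(SalesAmount) as SumOfSalesAmount
FROM dbo.External_FactInternetSales
GROUP BY SalesOrderNumber
OPTION (DISABLE EXTERNALPUSHDOWN);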

---Working with JSON Files
---Format query results as JSON

SELECT FirstName as [Customer.FirstName],
MiddleName as [Customer.MiddleName],
LastName as [Customer.LastName],
BirthDate as [Customer.BirthDate],
MaritalStatus as [Customer.MaritalStatus],
Gender as [Customer.Gender],
EmailAddress as [Customer.EmailAddress],
YearlyIncome as [Customer.YearlyIncome],
TotalChildren as [Customer.TotalChildren],
GEO.City as [Customer.City],
GEO.StateProvinceName as [Customer.State],
GEO.PostalCode as [Customer.ZipCode]
FROM AdventureWorksDW.dbo.DimCustomer C
INNER JOIN AdventureWorksDW.dbo.DimGeography GEO
ON C.GeographyKey = GEO.GeographyKey
WHERE GEO.CountryRegionCode = 'US'
FOR JSON PATH, ROOT('Customer')

---Import from JSON File
---https://blogs.msdn.microsoft.com/sqlserverstorageengine/2015/10/07/bulk-importing-json-files-into-sql-server/
----JSON stored in Azure Blob
/*Basic format from the public zip code data:
{ "city" : "AGAWAM", "loc" : [ -72.622739, 42.070206 ], "pop" : 15338, "state" : "MA", "id" : "01001" }
Important note: the id column was originally labeled _id;
I did a find and replace to change it to id so it would work.
Sample data obtained from http://media.mongodb.org/zips.json
*/
-- DROP EXTERNAL TABLE [dbo].[External_ZipCodes]
CREATE EXTERNAL TABLE [dbo].[External_ZipCodes](
[jsonrow] varchar(8000) NOT NULL -- single Record container
)
WITH (LOCATION = 'demo/AdventureWorksDW/zips.json',
DATA_SOURCE = AzurePolybaseStorage,
FILE_FORMAT = TextFilePipe);

--FILE_FORMAT = TextFileWithDefaults

SELECT TOP 10 [jsonrow]
FROM dbo.[External_ZipCodes]

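-- Before building a view on top of the raw rows, it can be worth checking for
-- malformed records. A minimal sketch using the built-in ISJSON function
-- (SQL Server 2016); it returns any rows that are NOT valid JSON:
SELECT [jsonrow]
FROM dbo.[External_ZipCodes]
WHERE ISJSON([jsonrow]) = 0;
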
--Then, within SQL Server, you can parse the JSON row and create a view on top of it. Example below.

-- The view must not share the name of the external table, so call it ExternalZipData
-- (the queries and the cleanup below reference that name).
-- DROP VIEW dbo.[ExternalZipData]
CREATE VIEW dbo.[ExternalZipData] ([City], [Population], [State], [Location], [ZipCode])
AS
SELECT [city],
[pop],
[state],
geography::STGeomFromText('POINT ('+ lon + ' ' + lat + ')', 4326) as [location], --spatial data type
[id]
FROM [External_ZipCodes]
CROSS APPLY OPENJSON(jsonrow) -- Record container
WITH (id nvarchar(20), --ID is the zip code
city nvarchar(100),
pop int '$.pop',
state nvarchar(2) '$.state',
lon varchar(20) '$.loc[0]', -- index
lat varchar(20) '$.loc[1]'
);

--- 29,467 rows in total; 1 minute 35 seconds execution time
SELECT [City],
[Population],
[State],
[ZipCode],
[Location],
[location].ToString() as Coordinate,
[location].Lat as Latitude,
[location].Long as Longitude
FROM dbo.ExternalZipData
WHERE city = 'Macon';

SELECT [City],
[Population],
[State],
[ZipCode],
[Location],
[location].ToString() as Coordinate,
[location].Lat as Latitude,
[location].Long as Longitude
from dbo.ExternalZipData
WHERE state = 'NY'
---Make sure you look at the results in the spatial results tab in SSMS

SELECT [state], count([state])
FROM dbo.ExternalZipData
GROUP BY [state]
ORDER BY count([state]) desc

---Clean up
USE PolyBaseConnect
GO

DROP EXTERNAL TABLE ExternalArchive
DROP EXTERNAL TABLE [dbo].[External_ZipCodes]
DROP EXTERNAL TABLE dbo.FactResellerSalesArchiveExternal
DROP EXTERNAL TABLE External_FactInternetSales
DROP VIEW [dbo].[ExternalZipData]
DROP TABLE [dbo].[FactResellerSalesArchive]
DROP EXTERNAL FILE FORMAT TextFilePipe
DROP EXTERNAL FILE FORMAT TextFileComma
DROP EXTERNAL FILE FORMAT TextFileGzip
DROP EXTERNAL FILE FORMAT [rcfile]
DROP EXTERNAL FILE FORMAT orcfile
DROP EXTERNAL FILE FORMAT parquetfile
--DROP EXTERNAL DATA SOURCE AzurePolybaseStorage
--DROP DATABASE SCOPED CREDENTIAL AzureStorageCredential
