Monday, November 9, 2015

Data Warehouse ETL Scripting

In this section we are going to create SQL script for ETL process to fill the Dim and Fact tables of Data Warehouse with data from OLTP database tables. The process includes the following steps.

1. Plan and document ETL process in BI documentation excel sheet to match the respective OLTP, OLAP fields with ETL scripts.

2. Here we use Flush and Fill ETL process, so first step is to drop the foreign constraints and clear the tables. ( this is called flush the tables)

3. Truncate all the Dim and Fact tables in Data Warehouse to free the allocated space in a memory.

4. Fill the Dim and Fact tables with data from OLTP tables with casting where ever needed to match the data type between two respective tables from OLTP and OLAP.

5. Re-create the foreign key constraints on the tables in the Data Warehouse.

6. Review and confirm that data has been filled in the Dim and Fact tables in Data Warehouse solution.

The following picture shows the planning sheet of ETL process.


The below SQL script describes more detail about ETL process step by step.

/**************************************************************
 Create the Data Warehouse
*************************************************************/

--****************** [DWNorthwindLite] *********************--
-- This file contains ETL code for use with the
-- [DWNorthwindLite] database.
--****************** Your Version ***************************--

USE [DWNorthwindLite]
Go

--********************************************************************--
-- 1) Drop the FOREIGN KEY CONSTRAINTS and Clear the tables
--********************************************************************--
Alter Table [DWNorthwindLite].dbo.FactOrders
Drop Constraint [FK_FactOrders_DimProducts]

Alter Table [DWNorthwindLite].dbo.FactOrders
Drop Constraint [FK_FactOrders_DimCustomers]

Alter Table [DWNorthwindLite].dbo.FactOrders
Drop Constraint [FK_FactOrders_DimDates]


Truncate Table [DWNorthwindLite].dbo.DimProducts
Truncate Table [DWNorthwindLite].dbo.DimCustomers
Truncate Table [DWNorthwindLite].dbo.DimDates
Truncate Table [DWNorthwindLite].dbo.FactOrders


--********************************************************************--
-- 2) FILL the Tables
--********************************************************************--


/****** [dbo].[DimProducts] ******/
INSERT INTO [DWNorthwindLite].dbo.DimProducts
SELECT
 [ProductID] = Products.ProductID
,[ProductName] = CAST(Products.ProductName as nVarchar(100))
,[ProductCategoryID] = Products.CategoryID
,[ProductCategoryName] = CAST(Categories.CategoryName as nVarchar(100))
,[StartDate] = 20000101
,[EndDate] = Null
,[IsCurrent] = 'Yes'
FROM [NorthwindLite].dbo.Categories
INNER JOIN [NorthwindLite].dbo.Products
ON Categories.CategoryID = Products.CategoryID;
Go

/****** [dbo].[DimCustomers] ******/
INSERT INTO DimCustomers
SELECT [CustomerID] = [NorthwindLite].[dbo].[Customers].[CustomerID]
,[CustomerName] = CAST([NorthwindLite].[dbo].[Customers].[CompanyName] as NVARCHAR(100))
,[CustomerCity] = CAST([NorthwindLite].[dbo].[Customers].[City] as NVARCHAR(100))
,[CustomerCountry] = CAST([NorthwindLite].[dbo].[Customers].[Country] as nvarchar(100))
,[StartDate] = 20000101
,[EndDate] = NULL
,[IsCurrent] = 'YES'
From [NorthwindLite].[dbo].[Customers]
Go

/****** [dbo].[DimDates] ******/
DECLARE @DateKey Date
DECLARE @EndDate Date
SET @DateKey = '1/1/1950'
SET @EndDate = '12/31/2050'
WHILE (@DateKey <= @EndDate)
BEGIN
INSERT INTO DimDates
    SELECT [DateKey] = Cast( Convert(nVarchar(100), @DateKey, 112) as int)
       , [USADateName] = Convert(nVarchar(100), @DateKey, 110)
       , [MonthKey] = Cast( Convert(nVarchar(100), DatePart(MONTH,@DateKey)) as int)
       , [MonthName] = Convert(nVarchar(100), DateName(MONTH,@DateKey), 110)
       , [QuarterKey] = Cast( Convert(nVarchar(100), DatePart(QUARTER,@DateKey), 110) as int)
       , [QuarterName] = Convert(nVarchar(100), DateName(QUARTER,@DateKey), 110)
       , [YearKey] = Cast( Convert(nVarchar(100), DatePart(Year,@DateKey), 110) as int)
       , [YearName] = Convert(nVarchar(100), DateName(Year,@DateKey), 110)

       SET @DateKey = DATEADD(DAY,1, @DateKey)
END
Go

/****** [dbo].[FactOrders] ******/
INSERT INTO [DWNorthwindLite].[dbo].[FactOrders]
SELECT [OrderID] = [NorthwindLite].[dbo].[Orders].[OrderID]
,[CustomerKey] = [DWNorthwindLite].[dbo].[DimCustomers].[CustomerKey]
,[OrderDateKey] = CAST(CONVERT(nvarchar(50),[NorthwindLite].[dbo].[Orders].[OrderDate], 112) as int)
,[ProductKey] = [DWNorthwindLite].[dbo].[DimProducts].[ProductKey]
,[ActualOrderUnitPrice]= [NorthwindLite].[dbo].[OrderDetails].[UnitPrice]
,[ActualOrderQuantity] = CAST([NorthwindLite].[dbo].[OrderDetails].[Quantity] as smallint)
FROM [NorthwindLite].[dbo].[Orders]
INNER JOIN
[NorthwindLite].[dbo].[OrderDetails]
ON
[NorthwindLite].[dbo].[Orders].[OrderID] = [NorthwindLite].[dbo].[OrderDetails].[OrderID]
INNER JOIN
[DimCustomers]
ON
[NorthwindLite].[dbo].[Orders].[CustomerID] = [DimCustomers].[CustomerID]
INNER JOIN
[DimProducts]
ON
[NorthwindLite].[dbo].[OrderDetails].[ProductID] = [DimProducts].[ProductID]
Go

--********************************************************************--
-- 3) Re-Create the FOREIGN KEY CONSTRAINTS
--********************************************************************--
ALTER TABLE DWNorthwindLite.dbo.FactOrders
  ADD CONSTRAINT FK_FactOrders_DimProducts
  FOREIGN KEY (ProductKey) REFERENCES DimProducts(ProductKey)

 ALTER TABLE DWNorthwindLite.dbo.FactOrders
  ADD CONSTRAINT FK_FactOrders_DimCustomers
  FOREIGN KEY (CustomerKey) REFERENCES DimCustomers(CustomerKey)

 ALTER TABLE DWNorthwindLite.dbo.FactOrders
  ADD CONSTRAINT FK_FactOrders_DimDates
  FOREIGN KEY (OrderDateKey) REFERENCES DimDates(DateKey)


--********************************************************************--
-- Review the results of this script
--********************************************************************--
Select * from [dbo].[DimProducts]
Select * from [dbo].[DimCustomers]
Select * from [dbo].[DimDates]
Select * from [dbo].[FactOrders]

This concludes the ETL process using SQL scripting for a data Warehouse solution.

Tuesday, October 27, 2015

Creation of OLAP Data Warehouse Solution from OLTP Database


While creating OLAP data Warehouse solution in this NorthwindLite database example, I have created Products, Customers and Date dimension table. Since Product Category does not have very critical fields to make separate table, I merged product category fields with Product dimension table and make it one Products dimension table. Next I have created Customers dimension table where I skipped Customer Address field which is less important. Later I created FactOrders fact table where I created surrogate keys and primary keys.

I created date dimension table and also added OrderDatekey to the factorders table.

once the data warehouse plan worksheet has done I created the DWNorthwindLite Data Warehouse solution.


OLTP vs. OLAP Databases

OLTP
  1.         Transactions such as Insert, update, delete.
  2.         Usually a Entity-relationship model.
  3.       Normalized database with lots of tables and not much duplicated data.
  4.          Used for storing detailed and up to date data.
  5.          Lots of transactions that is constantly updating.
  6.           Data may be collected from multiple points of entry, e.g. web application, desktop application, point-of-sale.
  7.      OLTP databases are primarily used to collect and store transaction data from a variety of inputs.
OLAP
  1.          Mainly select queries with very little updates.
  2.          Star or snowflake model.
  3.       De-normalized so data is aggregated in fact and dim tables.
  4.          Mainly used for data mining and reporting.
  5.          Often complex queries that include aggregation.
  6.          Needs fast response time.
  7.      OLAP databases are primarily used for reporting and analysis of data.  
  8.      Serves the needs of analysts, managers and executive employees in planning, problem solving and decision support.

Monday, October 26, 2015

Creating Northwind Lite Data Warehouse




In this blog we are going to create Northwind Lite Data Warehouse solution and the below plan shows the objects in the data warehouse solution.



First step in creation of data warehouse is to create data warehouse in this example we will create data warehouse DWNorthwindLite and then create the dimension, fact and Date dimension tables within this data warehouse.
The following script accomplishes the task of creating data warehouse and defining the tables.

/**************************************************************
 Create the Data Warehouse
*************************************************************/

--****************** [DWNorthwindLite] *********************--
-- This file will drop and create the [DWNorthwindLite]
-- database, with all its objects.
--****************** Instructors Version ***************************--

USE [master]
GO
If Exists (Select * from Sysdatabases Where Name = 'DWNorthwindLite')
            Begin
                        ALTER DATABASE [DWNorthwindLite] SET SINGLE_USER WITH ROLLBACK IMMEDIATE
                        DROP DATABASE [DWNorthwindLite]
            End
GO
Create Database [DWNorthwindLite]
Go

--********************************************************************--
-- Create the Tables
--********************************************************************--
USE [DWNorthwindLite]
Go

SET ANSI_NULLS ON
GO

SET QUOTED_IDENTIFIER ON
GO

SET ANSI_PADDING ON
GO

/****** [dbo].[DimProducts] ******/
CREATE TABLE [dbo].[DimProducts](
            [ProductKey] [int] NOT NULL PRIMARY KEY Identity,
            [ProductId] [int] NOT NULL,
            [ProductCategoryId] [int] NOT NULL,
            [ProductName] [nvarchar](100) NOT NULL,
            [ProductCategoryName] [nvarchar](100) NOT NULL,
            [StartDate] [date] NULL,
            [EndDate] [date] NULL,
            [Iscurrent] [int] NULL,
            )
Go

/****** [dbo].[DimCustomers] ******/
CREATE TABLE [dbo].[DimCustomers](
            [CustomerKey] [int] NOT NULL PRIMARY KEY Identity,
            [CustomerID] [nvarchar] (5) NULL,
            [CustomerName] [nvarchar](100) NULL,
            [CustomerCity] [nvarchar](100) NULL,
            [CustomerCountry] [nvarchar](100) NULL,
            [StartDate] [int] NULL,
            [EndDate] [int] NULL,
            [Iscurrent] [nvarchar](3) NULL,
)
Go

/****** [dbo].[DimDates] ******/
CREATE TABLE dbo.DimDates (
            [DateKey] int NOT NULL PRIMARY KEY IDENTITY,
            [USADateName] nVarchar(100) NULL,
            [MonthKey] int NOT NULL,
            [MonthName] nVarchar(100) NULL,
            [QuarterKey] int NOT NULL,
            [QuarterName] nVarchar(100) NULL,
            [YearKey] int NULL,
            [YearName] nVarchar(100) NULL,
)
Go

/****** [dbo].[FactOrders] ******/
CREATE TABLE dbo.FactOrders (
            [OrderKey] [int] NOT NULL,
            [OrderID] [int] NULL,
            [OrderDateKey] [int] NOT NULL,
            [CustomerKey] [int] NOT NULL,
            [ProductKey] [int] NOT NULL,
            [ActualOrderUnitPrice] [money] NOT NULL,
            [ActualOrderQuantity] [int] NOT NULL,
 CONSTRAINT [PK_FactOrders] PRIMARY KEY CLUSTERED
            ( [OrderKey] ASC, [OrderDateKey] ASC, [CustomerKey] ASC, [ProductKey] ASC )
)

Go

Second step is define key constraints including primary and foreign key relationship between tables and objects within it according to the objects defined in the plan document.

--********************************************************************--
-- Create the FOREIGN KEY CONSTRAINTS
--********************************************************************--
Alter Table [dbo].[FactOrders] With Check Add Constraint [FK_FactOrders_DimProducts]
Foreign Key  ([ProductKey]) References [dbo].[DimProducts] ([ProductKey])

Alter Table [dbo].[FactOrders] With Check Add Constraint [FK_FactOrders_DimCustomers]
Foreign Key  ([CustomerKey]) References [dbo].[DimCustomers] ([CustomerKey])

Alter Table [dbo].[FactOrders]  With Check Add Constraint [FK_FactOrders_DimDates]
Foreign Key ([OrderDateKey]) References [dbo].[DimDates] ( [DateKey] )
Go

The below screenshot shows, the tables and the data warehouse tree diagram generated in object explorer.




Third step is to generate database diagram showing the full data warehouse solution in graphical format.


The below scripts describes the backup and restoration of data warehouse solution.

/**
1) Make a copy of the empty database before startig the ELT Process
**/

BACKUP DATABASE [DWNorthwindLite]
TO DISK =
N'E:\_DWCert\Assignment Submission\Assignment03_Shubhada\DWNorthwindLite\DWNorthwindLiteBackup.bak'
GO

/**
2) Send the file to other team members and tell them they can restore the database with this code...
**/

-- Check to see if they alread hav a database with that name...
IF EXISTS (SELECT name FROM sys.databases WHERE name=N'DWNorthwindLite')
 BEGIN
 -- If they do, they need to close connections to the DWNorthwindLite database, with this code.
  ALTER DATABASE [DWNorthwindLite] SET SINGLE_USER WITH ROLLBACK IMMEDIATE
 END

-- Now they can restore the empty database...
USE MASTER
RESTORE DATABASE [DWNorthwindLite]
FROM DISK =
N'E:\_DWCert\Assignment Submission\Assignment03_Shubhada\DWNorthwindLite\DWNorthwindLiteBackup.bak'
WITH REPLACE
GO

The below shown QA report confirms that the database object conform to the warehouse planning documents.




Finally we will include all the data warehouse solution and it's related planning documents in one solution place in Visual Studio.

The below screenshot shows the complete data warehouse solution in one place.


Monday, October 19, 2015

BI Solution Planning and Data Warehouse Design

Business intelligence (BI) is a process for analyzing data and presenting actionable information to help corporate executives, business managers and other end users make more informed business decisions.

Figure: Business Intelligence Process

BI encompasses a variety of tools, applications and methodologies that enable organizations to collect data from internal systems and external sources using SQL Server Integration Service (SSIS) and Extract Load and Transform (ETL) process, prepare it for analysis using SQL Server Analysis Service (SSAS), develop and run queries against the data, and create reports using SQL Server Reporting Service (SSRS), dashboards and data visualizations to make the analytical results available to corporate decision makers as well as operational workers.


The potential benefits of business intelligence programs include accelerating and improving decision making; optimizing internal business processes; increasing operational efficiency; driving new revenues; and gaining competitive advantages over business rivals. BI systems can also help companies identify market trends and spot business problems that need to be addressed.
The first step in designing Business Intelligence solution is Planning phase where BI Consultants and Managers fills up the BI planning worksheet including the information about Roles, Solution Schedule, Estimated hours etc.

The planning phase starts with the interview, documenting the requirements, locating different data sources and data, deciding on teams and roles,  timelines for each and every step of planning, scheduling the BI plan, Deciding on nice to have data, must have features and not important information, scope of the requirement, deciding on achievable and non-achievable goals, user training etc.






There are 7 important points any company should consider before they go for BI Solution design to have effective solution for their organization and they are,
  • Storage and computing hardware: Firms will need to invest or upgrade their data storage infrastructure.
  • Applications and data sources: Source data will need to be scrubbed and organized.
  • Data integration : Firms will need to invest in middle ware "connectors" to allow data from source applications to be integrated with the BI repository.
  • OLAP applications and analytic engines: OLAP provide a layer of separation between the storage repository and the end user's analytic application and it  performs special analytical functions that require high-performance processing power.
  • Analytic applications: Analytic applications are the programs used to run queries against the data to perform "drill-down" analysis.
  • Information presentation and delivery products enterprises can purchase packaged or custom reporting products, such as Crystal Reports or leverage XML to deliver analyses through a portal or any other Internet-enabled interface for example PDA.
In this blog we are going to discuss a scenario of Yogurt Ice Cream Company seeking information on their sales of different flavors and toppings, popularity of flavors and toppings in an area.


After requirement gathering and collecting source data through interviews and other resources, the next step is design a Data Warehouse and tables.

One can design Data Warehouse in either Star Schema (data in non normalized form, speedy performance in querying data) or Snowflake schema (normalized form) or hybrid of both the design schema. Both schema use Fact and Dimension tables.

Fact tables references to dimensions and contains additive, non additive or calculated measures.

Dimension tables stores attributes or dimensions that describes objects in fact table.

Ice Cream Company Sales Data: Building Data Warehouse


IF EXISTS (SELECT *FROM sys.sysdatabases WHERE name = 'SalesIceCream_DW')
BEGIN
       DROP DATABASE SalesIceCream_DW
END
GO

 -- CREATE A DATABASE
Create database SalesIceCream_DW
Go

-- CREATE DIMENSION TABLES
Use SalesIceCream_DW
GO

CREATE TABLE DimFlavor
(FlavorKey int identity NOT NULL PRIMARY KEY NONCLUSTERED,
 FlavorAltKey nvarchar(10) NOT NULL,
 FlavorName nvarchar(50) NULL,
 FlavorCategoryName nvarchar(50))
Go

CREATE TABLE DimToppings
(ToppingsKey int identity NOT NULL PRIMARY KEY NONCLUSTERED,
 ToppingsAltKey nvarchar(10) NOT NULL,
 ToppingsName nvarchar(50) NULL,
 ToppingsCategoryName nvarchar(50))
GO

CREATE TABLE DimCustomer
(CustomerKey int identity NOT NULL PRIMARY KEY NONCLUSTERED,
 CustomerAltKey nvarchar(10) NOT NULL,
 CustomerName nvarchar(50) NULL,
 CustomerGender nvarchar(10) NULL)
GO

 CREATE TABLE DimDate
 (DateKey int NOT NULL PRIMARY KEY NONCLUSTERED,
  DateAltKey datetime NOT NULL,
  CalendarYear int NOT NULL,
  CalendarQuarter int NOT NULL,
  MonthOfYear int NOT NULL,
  [MonthName] nvarchar(15) NOT NULL,
  [DayOfMonth] int NOT NULL,
  [DayOfWeek] int NOT NULL,
  [DayName] nvarchar(15) NOT NULL,
  FiscalYear int NOT NULL,
  FiscalQuarter int NOT NULL)
GO

CREATE TABLE FactSalesOrder
(FlavorKey int NOT NULL REFERENCES DimFlavor(FlavorKey),
 ToppingsKey int NOT NULL REFERENCES DimToppings(ToppingsKey),
 CustomerKey int NOT NULL REFERENCES DimCustomer(CustomerKey),
 OrderDateKey int NOT NULL REFERENCES DimDate(DateKey),
 OrderNo int NOT NULL,
 ItemNo int NOT NULL,
 Quantity int NOT NULL,
 Price money NOT NULL,
 SalesAmount money NOT NULL
  CONSTRAINT [PK_FactSalesOrder] PRIMARY KEY NONCLUSTERED
(
  [FlavorKey],[ToppingsKey],[CustomerKey],[OrderDateKey],[OrderNo],[ItemNo]
)
)
GO

CREATE TABLE DimStore
 (StoreKey int identity NOT NULL PRIMARY KEY NONCLUSTERED,
  PostalCode nvarchar(15) NULL,
  City nvarchar(50) NULL,)
GO

-- POPULATE THE TIME DIMENSION TABLE
DECLARE @StartDate datetime
DECLARE @EndDate datetime
SET @StartDate = '01/01/2011'
SET @EndDate = getdate()
DECLARE @LoopDate datetime
SET @LoopDate = @StartDate
WHILE @LoopDate <= @EndDate
BEGIN
  INSERT INTO dbo.DimDate VALUES
       (
              CAST(CONVERT(VARCHAR(8), @LoopDate, 112) AS int) , -- date key
              @LoopDate, -- date alt key
              Year(@LoopDate), -- calendar year
              datepart(qq, @LoopDate), -- calendar quarter
              Month(@LoopDate), -- month number of year
              datename(mm, @LoopDate), -- month name
              Day(@LoopDate),  -- day number of month
              datepart(dw, @LoopDate), -- day number of week
              datename(dw, @LoopDate), -- day name of week
              CASE
                     WHEN Month(@LoopDate) < 7 THEN Year(@LoopDate)
                     ELSE Year(@Loopdate) + 1
               END, -- Fiscal year (assuming fiscal year runs from Jul to June)
               CASE
                     WHEN Month(@LoopDate) IN (1, 2, 3) THEN 3
                     WHEN Month(@LoopDate) IN (4, 5, 6) THEN 4
                     WHEN Month(@LoopDate) IN (7, 8, 9) THEN 1
                     WHEN Month(@LoopDate) IN (10, 11, 12) THEN 2
               END -- fiscal quarter
       )               
       SET @LoopDate = DateAdd(dd, 1, @LoopDate)


Create Database:



Create Dimension Table:



Create Fact Table and create relation between Fact Table and Dimension Table:


Data Warehouse Diagram ( Snowflake Schema)



Last but not least we add all the planning documents including BIsolutionPlan.docx and BISolutionWorksheet.xlsx documents to the Solution folder in Visual Studio 2013.

Here we complete the data warehouse design and planning.