/* Former vice president of Oracle "No major application will run in 3nf." Normalization is an optimization technique. It defines structures whereby redundancy is restricted to the foreign keys. pp. 148 If we don't optimize our database design, we have to handle the resulting problems in our applications. Just because you don't normalize to 3nf, it doesn't follow that the problems associated with unnormalized relations go away. And so... we need to understand what those problems are 1. Data redundancy 2. Modification anomalies (ambiguity/data loss/integrity loss) Normalization can be seen from two perspectives -Top-down verification -bottom-up normalization */ USE NormalizationChapter5 --This relation does not conform to the attributes of a relation listed --on page 62 SELECT * FROM RPT_FORMAT SELECT * FROM Data_Org_1nf --pp. 150 /* Three types of anomalies 1. update 2. insert 3. delete */ SELECT * FROM Data_Org_1nf WHERE Emp_Num = 105 SELECT * FROM Data_Org_1nf Order by Job_Class UPDATE Data_Org_1nf SET Job_Class = 'Database Web Designer' WHERE Emp_Num = 105 --Reverse operation UPDATE Data_Org_1nf SET Job_Class = 'Database Designer' WHERE Emp_Num = 105 /* There are at least three interpretations of this transaction 1. User is adding a new job_class and assigning this class to Alice 2. I'm changing the job class 'Database Engineer' to 'Database Web Designer' 3. Or did the user update the row to an incorrect value (domain violation) The reason the transaction is ambiguous is that the relation contains more than one theme or entity, viz., job, employee, assignment, and project. In the above UPDATE statement, it isn't clear (potentially) which theme is changed. */ --Insert anomaly INSERT INTO Data_Org_1nf (Emp_Num, Emp_Name, Job_Class) VALUES(445, 'Hari Ravuri', 'Programmer') SELECT * FROM Data_Org_1nf DELETE Data_Org_1nf WHERE Emp_Name = 'Hari Ravuri' /* The insert anomaly is that we can't insert an employee without assigning them to a project. Or, if we do, we have to introduce NULL values to candidate keys or create dummy values. */ /* Delete anomaly */ SELECT * FROM Data_Org_1nf WHERE Emp_Num = 103 SELECT * FROM Data_Org_1nf DELETE Data_Org_1nf WHERE Emp_Num = 103 /* The anomaly is that if we delete 103, we delete the only row for 'Elect. Engineer' The problem is that if you delete one theme, you potentially delete other themes and if it is the only occurrence, you lose data. */ --Step 2: identify the key (Proj_Num, Emp_Num) /* We can verify the uniqueness of the key by visually verifying that any given combination of Proj_Num and Emp_Num returns only one row. Or we can check for all combinations with a GROUP BY query. Our authors suggest we check the combination (Proj_Num, Emp_Num) (15, 103) pp. 153 */ SELECT Proj_Num, Emp_Num, * FROM Data_Org_1nf ORDER BY Proj_Num, Emp_Num --The following query returns only one row. This would be true of all combinations SELECT * FROM Data_Org_1nf WHERE Proj_Num=15 AND Emp_Num = 103 --We can check all combinations thus... SELECT COUNT(*) AS GrpCnt, Proj_Num, Emp_Num FROM Data_Org_1nf GROUP BY Proj_Num, Emp_Num HAVING COUNT(*) > 1 --We can verify that Proj_Num and Emp_Num are not individually unique with --For example, there are 5 occurrences of Proj_Num 15 SELECT COUNT(*) AS GrpCnt, Proj_Num FROM Data_Org_1nf GROUP BY Proj_Num HAVING COUNT(*) > 1 --Or we can do a distinct count --If Proj_Num is unique, the result would be the number of rows. --That is, there would be as many DISTINCT values as there are rows. SELECT COUNT(DISTINCT Proj_Num + Emp_Num) AS DistinctProj_NumValues FROM Data_Org_1nf SELECT COUNT(*) FROM Data_Org_1nf /* We could run the query for other possible candidate keys. For example, Proj_Num and Job_Class. There are two occurrences of (15, Database Designer) and (25, Systems Analyst) */ SELECT COUNT(*) AS GrpCnt, Proj_Num, Job_Class FROM Data_Org_1nf GROUP BY Proj_Num, Job_Class HAVING COUNT(*) > 1 /* We want to identify the primary key so we can identify functional dependencies. Functional dependencies point out situations where you have more than one theme in a relation and will potentially have modification anomalies. */ -- Step 3. Identify all dependencies -- pp. 153, is proj_num associated with only one proj_name? -- We could test each value individually. For example Project 15 SELECT * FROM Data_Org_1nf WHERE Proj_Num = 15 /* Or we could check using a GROUP BY and check all instances SELECT COUNT(DISTINCT dependent), determinant FROM table(s) GROUP BY determinant HAVING COUNT(DISTINCT dependent) > 1 The following query will check that there is only one (1) DISTINCT value for Proj_Name for each Proj_Num */ SELECT COUNT(DISTINCT Proj_Name) AS DistinctProj_NameValues, Proj_Num FROM Data_Org_1nf GROUP BY Proj_Num HAVING COUNT(DISTINCT Proj_Name) > 1 -- There is only one DISTINCT Proj_Name value for each Proj_Num value so -- Proj_Num --> Proj_Name pp. 153 -- pp. 153. What Emp_Num determine? -- Emp_Num --> {Emp_Name, Job_Class, Chg_Hour} -- Check for an individual value, say 105 SELECT * FROM Data_Org_1nf ORDER BY Emp_Num SELECT * FROM Data_Org_1nf WHERE Emp_Num = 105 --We can check all values with a GROUP BY SELECT COUNT(DISTINCT Emp_Name + Job_Class + CAST(Chg_Hour AS NVARCHAR)) AS DistinctProj_NameValues, Emp_Num FROM Data_Org_1nf GROUP BY Emp_Num HAVING COUNT(DISTINCT Emp_Name + Job_Class + CAST(Chg_Hour AS NVARCHAR)) > 1 --We don't get 1 DISTINCT value because the "*" character in Emp_Name --Does job_class determine chg_hour? pp. 154 SELECT * FROM Data_Org_1nf WHERE Job_Class = 'programmer' -- Job_Class --> Chg_Hour SELECT COUNT(DISTINCT Chg_Hour) AS DistinctChg_HourValues, Job_Class FROM Data_Org_1nf GROUP BY Job_Class HAVING COUNT(DISTINCT Chg_Hour) > 1 SELECT Proj_Num, Emp_Num,* FROM Data_Org_1nf /* Partial dependencies Proj_Num --> Proj_Name Emp_Num --> {Emp_Name, Job_Class, Chg_Hour} Transitive dependency Emp_Num --> Job_Class --> Chg_Hour P1. Emp_Num --> Job_Class P2. Job_Class --> Chg_Hour Ergo: Emp_Num --> Chg_Hour We can test Emp_Num --> Job_Class */ SELECT COUNT(DISTINCT Job_Class) AS DistinctJob_ClassValues, Emp_Num FROM Data_Org_1nf GROUP BY Emp_Num HAVING COUNT(DISTINCT Job_Class) > 1 SELECT COUNT(DISTINCT Chg_Hour) AS DistinctJob_ClassValues, Job_Class FROM Data_Org_1nf GROUP BY Job_Class HAVING COUNT(DISTINCT Chg_Hour) > 1 /* The only attribute in the table that is determined by the primary key and nothing but the primary key is Hours: Primay Key (Proj_Num, Emp_Num) --> Hours Partial dependencies Proj_Num --> Proj_Name Emp_Num --> {Emp_Name, Job_Class, Chg_Hour} Through normalization we have established that there are two themes in our table Conversion to 2NF pp. 155 Project ---< Assignment >--- Employee */ SELECT DISTINCT Proj_Num, Proj_Name FROM Data_Org_1nf AS Project SELECT DISTINCT Emp_Num, Emp_Name, Job_Class, Chg_Hour FROM Data_Org_1nf AS Employee SELECT Proj_Num, Emp_Num, Hours FROM Data_Org_1nf AS Assignment --Conversion to 3NF pp. 157 /* We have to deal with the transitive dependency Emp_Num --> Job_Class --> Chg_Hour in Employee Create two relations that removes the transitive dependency Project ---< Assignment >--- Employee >--- Job The new entities are: */ SELECT DISTINCT Job_Class, Chg_Hour FROM Data_Org_1nf AS Job SELECT DISTINCT Emp_Num, Emp_Name, Job_Class FROM Data_Org_1nf AS Employee SELECT DISTINCT Proj_Num, Proj_Name FROM Data_Org_1nf AS Project SELECT Proj_Num, Emp_Num, Hours FROM Data_Org_1nf AS Assignment --Problems from back of chapter SELECT * FROM tablep51 --step 1. Identify the key --step 2. Identify dependent attributes --step 3. Remove dependent attributes to new tables /* The source of the problem for non-3NF form relations is that the relation contains attributes from more than one theme. The rules of normalization can be used to create relations that contain one theme and one theme only. We normally do this in a top-down way by analyzing the scenario and identifying entities. Entities should be about only one theme (e.g., Student). */ --1. Identify the candidate key -- Is INV_NUM a candidate key? Does it uniquely identify each row? SELECT COUNT(DISTINCT INV_NUM) AS PkRowCount FROM tablep51 --No, there are three distinct values for Inv_Num. There should only be --as many distinct values as there are rows in the table --How about the combination INV_NUM, PROD_NUM --You can test it for at least one case with a query SELECT * FROM TableP51 WHERE Inv_Num=211347 AND Prod_Num = 'AA-E3422QW' SELECT COUNT(DISTINCT CAST(INV_NUM AS NVARCHAR) + PROD_NUM) AS PkRowCount FROM tablep51 SELECT COUNT(*) AS PkRowCount, INV_NUM, PROD_NUM FROM tablep51 GROUP BY INV_NUM, PROD_NUM HAVING COUNT(*) > 1 --two partial dependencies --one transitive dependency SELECT * FROM tablep51 --Step 3. remove dependent attributes --inv_num --> sale_date (Partial dependency) Only depends on part of the key SELECT DISTINCT Inv_Num, Sale_DAte FROM TableP51 SELECT COUNT(DISTINCT Sale_DAte) AS DistinctSale_DateValues, Inv_Num FROM TableP51 GROUP BY Inv_Num HAVING COUNT(DISTINCT Sale_DAte) > 1 -- What about Prod_Num --> SaleDate? (just to check) SELECT COUNT(DISTINCT Sale_DAte) AS DistinctSale_DateValues, Prod_Num FROM TableP51 GROUP BY Prod_Num HAVING COUNT(DISTINCT Sale_DAte) > 1 /* From the sample data it looks as if there is a depedency. But I rather suspect that if we had more data, we would find a Prod_Num sold on a different sale_date. The sample data only contain three invoices... I say we dismiss this inference as spurious. */ -- Prod_num --> {prod_label, prod_price} (Partial dependency) only depends on part of the key SELECT DISTINCT prod_num, prod_label, prod_price FROM TableP51 SELECT COUNT(DISTINCT prod_label + CAST(prod_price AS NVARCHAR)) AS DistinctValues, prod_num FROM TableP51 GROUP BY prod_num HAVING COUNT(DISTINCT prod_label + CAST(prod_price AS NVARCHAR)) > 1 SELECT * FROM TableP51 -- Vend_code --> vend_name ? SELECT DISTINCT vend_code, vend_name FROM TableP51 SELECT COUNT(DISTINCT vend_name) AS DistinctValues, vend_code FROM TableP51 GROUP BY vend_code HAVING COUNT(DISTINCT vend_name) > 1 -- Products are supplied by a vendor? -- Prod_Num --> Vend_code --> vend_name SELECT COUNT(DISTINCT Vend_code) AS DistinctValues, Prod_Num FROM TableP51 GROUP BY Prod_Num HAVING COUNT(DISTINCT Vend_code) > 1 /* inv_num --> sale_date Prod_num --> {prod_label, prod_price} Vend_code --> vend_name Prod_Num --> Vend_code --> vend_name Conversion to 2NF pp. 155 Product ---< Invoice */ SELECT * FROM TableP51 SELECT DISTINCT Prod_Num, Prod_Label, Prod_Price, Vend_Code, Vend_Name FROM TableP51 AS Product SELECT DISTINCT Inv_Num, Prod_Num, Sale_Date, Quant_Sold FROM TableP51 AS Invoice --Conversion to 3NF pp. 157 /* We have to deal with the transitive dependency Prod_Num --> Vend_code --> vend_name Create two relations that removes the transitive dependency Vendor ---< Product ---< Invoice The new entities are: */ SELECT DISTINCT Vend_Code, Vend_Name FROM TableP51 AS Vendor SELECT DISTINCT Prod_Num, Prod_Label, Prod_Price, Vend_Code FROM TableP51 AS Product SELECT DISTINCT Inv_Num, Prod_Num, Sale_Date, Quant_Sold FROM TableP51 AS Invoice --What are the anomalies? SELECT * FROM Tablep51 /* Insert anomaly? Can't add a new product or vendor without a value for Inv_num, that is, you can't put it in the database before you sell it to someone. Delete anomaly? If you delete invoice inv_num = 211349, you may loose all information about vendor ToughGo or the price of a power drill Update anomaly? If you change the Vend_Name for the first row (211347, AA-E3422QW) to 'EverFail' was the intent to update all values of 'NeverFail' to 'EverFail', create a new vendor and assign this new vendor to Rotary Sander (and change the vendor_code), or an error */ SELECT * FROM tablep55 --step 1. Identify the key --step 2. Identify dependent attributes --step 3. Remove dependent attributes to new tables