Re: Indexing techniques for hierarchies

From: Matt Wigdahl <mlwigdahl_at_yahoo.com>
Date: 2 Jan 2002 08:05:30 -0800
Message-ID: <f94322ea.0201020805.7e865f6b_at_posting.google.com>


I am not familiar with XDb syntax but I am very familiar with C++, and it seems like some of your statements below are incorrect. You claim that each object was of a different class, but apparently you set up one class template and created 10000 object instances from it, each one linked to the one before, an impression reinforced by the fact that all data members for your objects were the same.

This is just a degenerate case of an adjacency list. I would certainly not approach this problem in an RDBMS by creating 10000 different tables; that would be completely feebleminded.

In my opinion a better benchmarking method to compare the performace of two DBMS would be to define some simple real-world tasks and let each "side" generate the most optimal data model and data manipulation code they can to satisfy the output requirements. Approaches such as yours ("make 10000 tables", etc.) just lead to square-peg, round-hole efforts to compare internal constructs when what is really needed is to compare tangible results independent of implementation methodology.

For the record, I would stab at the problem in in SQL Server like so (please forgive the IDENTITY statement; used for brevity):

  • begin create table and data

BEGIN TRANSACTION
SET QUOTED_IDENTIFIER ON
GO
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE GO
COMMIT
BEGIN TRANSACTION
CREATE TABLE dbo.Adj

	(
 	ID int NOT NULL IDENTITY (1, 1),
	Parent_ID int NULL,
	Bool bit NOT NULL CONSTRAINT DF_Adj_Bool DEFAULT (1),
	Dub float(53) NOT NULL CONSTRAINT DF_Adj_Dub DEFAULT (1.12345678),
	Date datetime NOT NULL CONSTRAINT DF_Adj_Date DEFAULT ('1/1/2001
12:00'),
	Text char(255) NOT NULL CONSTRAINT DF_Adj_Text DEFAULT
('abcdefghijklmnopqrstuvwxyz0123456789')
	) ON [PRIMARY]

GO
ALTER TABLE dbo.Adj ADD CONSTRAINT
	PK_Adj PRIMARY KEY CLUSTERED 
	(
	ID
	) ON [PRIMARY]

GO
COMMIT DECLARE _at_parent_id int

INSERT INTO Adj (Parent_ID) VALUES (NULL) SELECT _at_parent_id = MAX(Parent_ID) FROM Adj

DECLARE _at_count int
SET _at_count = 0

SET NOCOUNT ON WHILE (_at_count < 10000)
BEGIN

	INSERT INTO Adj (Parent_ID) VALUES (_at_parent_id)
	SELECT _at_parent_id = MAX(ID) FROM Adj
	SET _at_count = @count + 1

END SET NOCOUNT OFF GO
-- end create table and data

The traversal SQL that most closely approximates your code is:

  • begin sql 1 DECLARE _at_date_start datetime DECLARE _at_date_end datetime

DECLARE _at_cur_id int
DECLARE _at_parent_id int

DECLARE _at_bool bit
DECLARE _at_dub float
DECLARE _at_date datetime
DECLARE _at_text char(255)

SELECT _at_cur_id = MAX(ID) FROM Adj

SET _at_date_start = GETDATE()

WHILE (_at_cur_id IS NOT NULL)
BEGIN
        SELECT _at_parent_id = Parent_ID, @bool = Bool, @dub = Dub, @date = Date, _at_text = Text

	FROM Adj 
	WHERE ID = _at_cur_id

	SET _at_cur_id = @parent_id

END SET _at_date_end = GETDATE()

SELECT DATEDIFF(ms, _at_date_start, @date_end) -- end sql 1

This SQL runs in approximately 420 ms on a Pentium II 400, SQL Server 7.0/NT 4.0, 256 MB RAM (I think).

Since you don't actually do anything with the intermediate traversal variables, I took the liberty of optimizing the SQL further to do what your benchmark code ultimately does -- namely, retrieve the root node's variables:

DECLARE _at_cur_id int
DECLARE _at_parent_id int

DECLARE _at_bool bit
DECLARE _at_dub float
DECLARE _at_date datetime
DECLARE _at_text char(255)

SELECT _at_cur_id = MAX(ID) FROM Adj

SET _at_date_start = GETDATE()

SELECT _at_cur_id = ID, @bool = Bool, @dub = Dub, @date = Date, @text = Text FROM Adj WHERE ID = (SELECT MIN(ID) FROM Adj)

SET _at_date_end = GETDATE()

SELECT DATEDIFF(ms, _at_date_start, @date_end) -- end sql 2

This code runs in between 0 and 10 ms on the same platform and matches your real-world final state.

jraustin1_at_hotmail.com (James) wrote in message news:<a6e74506.0112261813.742ba3be_at_posting.google.com>...
> The following benchmark deviates from the steps outlined earlier in
> order to simplify it creation. Also posted at
> www.xdb1.com/Benchmark/7.asp
>
> XDb required 16 ms to access 10,000 objects and copy their properites
> to variables starting with the last child. Each object was of a
> different class and each object was the child of the previous object.
> Each object had five properties of following data types: int(objId),
> bool, double, dateTime and string.
>
> This benchmark is similar to the following in a relational database.
> Create 10,000 tables. Each table has one record. Each record has five
> fields plus an additional integer field is needed in a relational db
> to related records. Each record is the child of the record in the
> previous table. Access each record and copy its five properties to
> variables starting with the last child.
>
> Below is the C++ code (edited for clarity) used for the benchmark.
>
> void CXDbView::MenuTools_Benchmark_7_click()
> {
> // Get start time
> // Create objPkg for Test Class
> pTest = pO_m->InstPkg_add_(pRoot_g, // Cls
> pFldrMyData_g, // Parent
> _T("Test")); // Inst name
>
> // Create property pPBool
> pPBool = pO_m->InstPkg_add_(pRoot_g, // Cls
> pFldrMyData_g, // Parent
> _T("PBool")); // Inst name
>
> // Create property pPDbl
> pPDbl = pO_m->InstPkg_add_(pRoot_g, // Cls
> pFldrMyData_g, // Parent
> _T("PDbl")); // Inst name
>
> // Create property pPDate
> pPDate = pO_m->InstPkg_add_(pRoot_g, // Cls
> pFldrMyData_g, // Parent
> _T("PDate")); // Inst name
>
> // Create property pPStr
> pPStr = pO_m->InstPkg_add_(pRoot_g, // Cls
> pFldrMyData_g, // Parent
> _T("PStr")); // Inst name
>
> // Attach properties to Test Cls and set default values
> pO_m->Prop_childFirst_set_(pTest, pPBool,
> kObjType_PropertyVirtual_g,
> _T("True"));
> pO_m->Prop_childFirst_set_(pTest, pPDbl,
> kObjType_PropertyVirtual_g,
> _T("1.2345678"));
> pO_m->Prop_childFirst_set_(pTest, pPDate,
> kObjType_PropertyVirtual_g,
> _T("1/1/2001 12:00 P"));
> pO_m->Prop_childFirst_set_(pTest, pPStr,
> kObjType_PropertyVirtual_g,
>
> _T("abcdefghijklmnopqrstuvwxyz0123456789"));
>
> // Create X objects, each an instances and child of previous
> pNew = pTest;
> const int LOOPS = 10000;
> for (int x=1; x < LOOPS; x++){
> pNew = pO_m->Inst_add_(pNew, // cls
> pNew); // parent
> ASSERT(pNew);
> }
>
> // Get start time
> DWORD timeStart = GetTickCount();
>
> // Loop thru all instances starting from last child
> pPBoolX = NULL;
> pPDblX = NULL;
> pPDateX = NULL;
> pPStrX = NULL;
>
> int id = 0;
> bool bol = false;
> double dbl = 0;
> COleDateTime* dt;
> TCHAR str[kObjTxtLenMax_g+1] = _T("");
>
> pX = pNew;
> while (pX != pTest){
> // Get pX's props and put in local vars
> // Int
> id = pX->id;
>
> // Bool
> pPBoolX = pO_m->Prop_get_(pX, pPBool, false);
> bol = pO_m->DataBool_get_(pPBoolX->childFirst->cls);
>
> // Dbl
> pPDblX = pO_m->Prop_get_(pX, pPDbl, false);
> dbl = pO_m->DataDecimalPrecise_get_(pPDblX->childFirst->cls);
>
> // Date
> pPDateX = pO_m->Prop_get_(pX, pPDate, false);
> dt = pO_m->DataDateTime_get_(pPDateX->childFirst->cls);
>
> // Str
> pPStrX = pO_m->Prop_get_(pX, pPStr, false);
> pO_m->Data_getAsStr_(pPStrX->childFirst->cls, str);
>
> // Move to parent
> pX = pX->parent_();
> }
>
> // Get end time
> DWORD timeEnd = GetTickCount();
>
> // Calc time elapsed
> DWORD timeElapsed = timeEnd - timeStart;
>
> // Async flush memory to disk
> pDb_m->File_save_();
>
> // Display results
> CString sResults;
> sResults.Format(_T("Benchmark 7\n
> Accessed props of %d hierarchal objs in %d
> msec."),
> LOOPS, timeElapsed);
>
> MessageBox(sResults);
> }
Received on Wed Jan 02 2002 - 17:05:30 CET

Original text of this message