Developer Documentation
GPUCacheOptimizer.cc
/*===========================================================================*\
 *
 *                              OpenFlipper
 *         Copyright (c) 2001-2015, RWTH-Aachen University
 *         Department of Computer Graphics and Multimedia
 *                          All rights reserved.
 *                          www.openflipper.org
 *
 *---------------------------------------------------------------------------*
 * This file is part of OpenFlipper.
 *---------------------------------------------------------------------------*
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
\*===========================================================================*/

/*===========================================================================*\
 *
 *   $Revision$
 *   $Author$
 *   $Date$
 *
\*===========================================================================*/

//=============================================================================

#include "GPUCacheOptimizer.hh"
#include <cassert>
#include <cmath>
#include <vector>
#include <cstring>

//=============================================================================

namespace ACG
{

//=============================================================================

GPUCacheOptimizer::GPUCacheOptimizer( unsigned int NumTris, unsigned int NumVerts, unsigned int IndexSize, const void* pIndices) :
  m_NumVerts(NumVerts),
  m_NumTris(NumTris),
  m_IndexSize(IndexSize),
  m_pIndices(pIndices),
  m_NumTransformations(0)
{
  m_pTriMap = new unsigned int[m_NumTris];
}

GPUCacheOptimizer::~GPUCacheOptimizer(void)
{
  delete [] m_pTriMap;
}

//=============================================================================

unsigned int GPUCacheOptimizer::GetIndex(unsigned int i) const
{
  assert(i < m_NumTris * 3);

  return GetIndex(i, m_IndexSize, m_pIndices);
}

unsigned int GPUCacheOptimizer::GetIndex(unsigned int i, unsigned int IndexSize, const void* pIB)
{
  switch (IndexSize)
  {
  case 4: return ((const unsigned int*)pIB)[i];
  case 2: return ((const unsigned short*)pIB)[i];
  case 1: return ((const unsigned char*)pIB)[i];
  default:
    assert(IndexSize == 1 || IndexSize == 2 || IndexSize == 4); // unsupported index size
  }
  return 0xFFFFFFFF;
}

void GPUCacheOptimizer::SetIndex(unsigned int i, unsigned int val, unsigned int IndexSize, void* pIB)
{
  switch (IndexSize)
  {
  case 4: ((unsigned int*)pIB)[i] = val; break;
  case 2: ((unsigned short*)pIB)[i] = val; break;
  case 1: ((unsigned char*)pIB)[i] = val; break;
  default:
    assert(IndexSize == 1 || IndexSize == 2 || IndexSize == 4); // unsupported index size
  }
}

//=============================================================================

void GPUCacheOptimizer::WriteIndexBuffer(unsigned int DstIndexSize, void* pDst)
{
  assert(DstIndexSize == 1 || DstIndexSize == 2 || DstIndexSize == 4);
  // TODO: warning log, if DstIndexSize < m_IndexSize

  // support for 'in-place' operation via tmpbuf copy
  char* pSrc = (char*)m_pIndices;

  int bTmpCopy = 0;
  if (pDst == pSrc)
  {
    pSrc = new char[m_IndexSize * m_NumTris * 3];
    memcpy(pSrc, m_pIndices, m_IndexSize * m_NumTris * 3);

    bTmpCopy = 1;
  }

  for (unsigned int i = 0; i < m_NumTris; ++i)
  {
    for (int k = 0; k < 3; ++k)
    {
      unsigned int TriVertex = GetIndex(m_pTriMap[i] * 3 + k, m_IndexSize, pSrc);

      // copy remapped tri indices
      SetIndex(i * 3 + k, TriVertex, DstIndexSize, pDst);
    }
  }

  if (bTmpCopy) delete [] pSrc;
}
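
// Usage sketch (illustrative only, not part of the original sources): reorder
// the triangles of an indexed mesh for better post-transform cache reuse and
// write the result back into the same 32-bit index buffer. 'pIndices',
// 'numTris' and 'numVerts' are hypothetical caller-side values; the cache
// size of 24 entries is an arbitrary choice.
//
//   GPUCacheOptimizerTipsify tipsify(24, numTris, numVerts, 4, pIndices);
//   tipsify.WriteIndexBuffer(4, pIndices); // in-place rewrite is supported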

//=============================================================================

void GPUCacheOptimizer::RemapVertices(unsigned int NumTris, unsigned int NumVerts,
                                      const unsigned int* pVertMap,
                                      unsigned int IndexSize, void* pInOutIndices,
                                      unsigned int VertexStride, void* pInOutVertices)
{
  if (pVertMap && pInOutIndices && pInOutVertices && VertexStride)
  {
    // make tmp vertex buffer copy
    char* pTmpBuf = new char[VertexStride * NumVerts];
    memcpy(pTmpBuf, pInOutVertices, VertexStride * NumVerts);

    char* pVertexOut = (char*)pInOutVertices;

    // apply on vertex buffer

    for (unsigned int i = 0; i < NumVerts; ++i)
    {
      // some mapping destinations might be invalid:
      // the vertex is not referenced by the index buffer, so skip it
      if (pVertMap[i] < NumVerts)
        memcpy(pVertexOut + pVertMap[i] * VertexStride,
               pTmpBuf + i * VertexStride, VertexStride);
    }

    // apply on index buffer

    for (unsigned int i = 0; i < NumTris * 3; ++i)
    {
      // IndexBuffer[i] = VertMap[IndexBuffer[i]]

      unsigned int v = GetIndex(i, IndexSize, pInOutIndices);
      SetIndex(i, pVertMap[v], IndexSize, pInOutIndices);
    }

    delete [] pTmpBuf;
  }
}
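
// Usage sketch (illustrative only): after the triangle order has been
// optimized, reorder the vertex buffer so that vertices are fetched in
// roughly increasing memory order. 'pIndices', 'pVertices', 'numTris',
// 'numVerts' and the Vertex struct are hypothetical caller-side values.
//
//   std::vector<unsigned int> vertMap(numVerts);
//   GPUCacheOptimizer::OptimizeVertices(numTris, numVerts, 4, pIndices,
//                                       &vertMap[0]);
//   GPUCacheOptimizer::RemapVertices(numTris, numVerts, &vertMap[0],
//                                    4, pIndices, sizeof(Vertex), pVertices);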

//=============================================================================

void GPUCacheOptimizer::OptimizeVertices(unsigned int NumTris, unsigned int NumVerts,
                                         unsigned int IndexSize, const void* pIndices,
                                         unsigned int* pVertMap)
{
  // straightforward algorithm:
  // iterate over the indices in drawing order and assign the next free
  // vertex location whenever an unvisited vertex is encountered
  unsigned int uCounter = 0; // vertex counter

  memset(pVertMap, 0xFF, NumVerts * sizeof(unsigned int));

  for (unsigned int i = 0; i < NumTris * 3; ++i)
  {
    // read the next vertex index (handles 1, 2 and 4 byte indices)
    unsigned int vertex = GetIndex(i, IndexSize, pIndices);

    if (pVertMap[vertex] == 0xFFFFFFFF)
      pVertMap[vertex] = uCounter++;
  }
}

//=============================================================================

unsigned int GPUCacheOptimizer::ComputeNumberOfVertexTransformations(unsigned int VertexCacheSize)
{
  if (m_NumTransformations) return m_NumTransformations;

  unsigned int NumIndices = 3 * m_NumTris;
  if (!NumIndices) return 0;

  // simulate a FIFO vertex cache of the given size and count the misses
  unsigned int* Cache = new unsigned int[VertexCacheSize];
  unsigned int NumSlotsInUse = 0;
  unsigned int LastSlot = 0;
  m_NumTransformations = 0;

  for (unsigned int i = 0; i < m_NumTris; ++i)
  {
    unsigned int t = m_pTriMap[i];

    // for each vertex of triangle t:
    for (int k = 0; k < 3; ++k)
    {
      unsigned int Idx = GetIndex(t * 3 + k); // vertex index

      int bInCache = 0;
      for (unsigned int c = 0; c < NumSlotsInUse && !bInCache; ++c)
      {
        if (Cache[c] == Idx) bInCache = 1;
      }

      if (!bInCache)
      {
        // cache miss: the vertex has to be transformed again
        ++m_NumTransformations;
        if (NumSlotsInUse < VertexCacheSize)
        {
          Cache[NumSlotsInUse++] = Idx;
          ++LastSlot;
        }
        else
        {
          // cache full: replace the oldest entry (FIFO)
          if (LastSlot == VertexCacheSize) LastSlot = 0;
          Cache[LastSlot++] = Idx;
        }
      }
    }
  }

  delete [] Cache;

  return m_NumTransformations;
}

//=============================================================================

float GPUCacheOptimizer::ComputeACMR(unsigned int VertexCacheSize)
{
  unsigned int NumT = ComputeNumberOfVertexTransformations(VertexCacheSize);
  return float(NumT) / float(m_NumTris);
}

//=============================================================================

float GPUCacheOptimizer::ComputeATVR(unsigned int VertexCacheSize)
{
  unsigned int NumT = ComputeNumberOfVertexTransformations(VertexCacheSize);
  return float(NumT) / float(m_NumVerts);
}
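
// Worked example (hypothetical numbers): a mesh with 1000 triangles, 600
// vertices and 900 counted vertex transformations yields
//   ACMR = 900 / 1000 = 0.9   (transformations per triangle)
//   ATVR = 900 /  600 = 1.5   (transformations per unique vertex)
// ATVR approaches 1.0 for an ideally ordered mesh, while the attainable ACMR
// also depends on the vertex/triangle ratio (roughly 0.5 for large regular
// meshes, up to 3.0 when the cache never produces a hit).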

//=============================================================================

// Forsyth's score function
void GPUCacheOptimizer::Opt_Vertex::FindScore(unsigned int MaxSizeVertexCache)
{
  float fNewScore = -1.0f; // -1 : vertex unused
  if (iNumTrisLeft > 0)
  {
    if (iCachePos < 0) fNewScore = 0.0f; // not in FIFO
    else
    {
      if (iCachePos < 3)
      {
        // vertex was used in the last triangle => fixed score
        const float LastTriScore = 0.75f;
        fNewScore = LastTriScore;
      }
      else
      {
        const float CacheDecayPower = 1.5f;

        // check for cache_pos < MaxSizeCachePos here..
        // points for being high in the cache
        const float Scaler = 1.0f / (MaxSizeVertexCache - 3);
        fNewScore = 1.0f - ( iCachePos - 3 ) * Scaler;
        fNewScore = powf( fNewScore, CacheDecayPower);
      }
    }

    // Bonus points for having a low number of tris still to
    // use the vert, so we get rid of lone verts quickly.

    const float ValenceBoostScale = 2.0f;
    const float ValenceBoostPower = 0.5f;

    float ValenceBoost = powf( float(iNumTrisLeft), -float(ValenceBoostPower));
    fNewScore += ValenceBoostScale * ValenceBoost;
  }

  fScore = fNewScore;
}
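
// Worked example (hypothetical values): with MaxSizeVertexCache = 32, a vertex
// at iCachePos = 10 with iNumTrisLeft = 4 scores roughly
//   cache term   : (1 - (10 - 3) / (32 - 3))^1.5 ~ 0.759^1.5 ~ 0.66
//   valence term : 2.0 * 4^-0.5                              = 1.00
//   fScore       ~ 1.66
// so vertices high in the cache with few remaining triangles are picked first.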

void GPUCacheOptimizer::Opt_Vertex::RemoveTriFromList(unsigned int tri)
{
  // note: assumes <tri> is present in the list; the counter is decremented
  // unconditionally
  for (int k = 0; k < iNumTrisLeft; ++k)
  {
    // replace tri with last tri in this list
    if (pTris[k] == tri)
    {
      pTris[k] = pTris[iNumTrisLeft-1];
      break;
    }
  }
  --iNumTrisLeft;
}

//=============================================================================
// tipsify

GPUCacheOptimizerTipsify::GPUCacheOptimizerTipsify(unsigned int CacheSize, unsigned int NumTris, unsigned int NumVerts, unsigned int IndexSize, const void *pIndices)
: GPUCacheOptimizer(NumTris, NumVerts, IndexSize, pIndices)
{
  // optimization notes:
  // - cache unfriendly layout of Opt_Vertex: high read/write access cost to Opt_Vertex data

  if (NumVerts < 3 || !NumTris) return;

  Opt_Vertex* pVerts = new Opt_Vertex[NumVerts];
  Opt_Tris* pTris = new Opt_Tris[NumTris];

  // OPTIMIZATION: memset vs constructor initialization: memset - 400 ms, constructor - 770 ms (at 10 mil vertices)
  memset(pVerts, 0, sizeof(Opt_Vertex) * NumVerts);

  // build adjacency, same start as in the Forsyth optimizer
  for (unsigned int i = 0; i < NumTris; ++i)
  {
    Opt_Tris* pThisTri = pTris + i;

    // copy vertex indices of this tri
    for (unsigned int k = 0; k < 3; ++k)
    {
      const unsigned int idx = GetIndex(i * 3 + k);
      pThisTri->v[k] = idx;

      // count # tris per vertex
      ++pVerts[idx].iNumTrisTotal;
    }
  }

  // OPTIMIZATION: allocate one buffer for the complete vertex adjacency list (instead of allocating a new buffer for each vertex)
  const unsigned int vertexTriAdjBufSize = NumTris * 3;
  unsigned int vertexTriAdjBufOffset = 0;
  unsigned int* vertexTriAdjBuf = new unsigned int[vertexTriAdjBufSize];

  // create list of tris per vertex
  for (unsigned int i = 0; i < NumTris; ++i)
  {
    // add this tri to per vertex tri list
    for (int k = 0; k < 3; ++k)
    {
      Opt_Vertex* pV = pVerts + pTris[i].v[k];
      if (!pV->pTris)
      {
        pV->pTris = vertexTriAdjBuf + vertexTriAdjBufOffset;
        vertexTriAdjBufOffset += pV->iNumTrisTotal;
      }

      // abuse <iNumTrisLeft> as a temporary up-counter
      // (it automatically sums up to iNumTrisTotal, which is exactly what we want)
      pV->pTris[pV->iNumTrisLeft++] = i;

      pV->iCachePos = 0;
    }
  }

  // use the iCachePos of each Opt_Vertex as its time stamp

  //===========================================================================
  // OPTIMIZATION:
  // push and pop on DeadEndVertexStack greatly increases processing time
  // -> replaced with a fixed size ring stack
  // std::vector<unsigned int> DeadEndVertexStack;
  // DeadEndVertexStack.reserve(2048);
  RingStack DeadEndVertexStack(128);

  int f = 0; // arbitrary starting index (vertex)
  int iTimeStamp = CacheSize + 1;
  unsigned int i = 1; // cursor

  unsigned int numTrisAdded = 0;

  std::vector<unsigned int> N; // 1-ring of next candidates
  N.reserve(2048);

  while (f >= 0)
  {
    N.clear();

    // this vertex
    Opt_Vertex* pV = pVerts + f;

    // for each adjacent tri of this vertex
    for (int m = 0; m < pV->iNumTrisTotal; ++m)
    {
      Opt_Tris* pT = pTris + pV->pTris[m];

      if (!pT->bAdded)
      {
        // append to the output triangle order
        m_pTriMap[numTrisAdded++] = pV->pTris[m];

        for (int k = 0; k < 3; ++k)
        {
          // push to cache
          const unsigned int v = pT->v[k];
          // DeadEndVertexStack.push_back(pT->v[k]);
          DeadEndVertexStack.push(v);

          // insert into the candidate set
          N.push_back(v);

          Opt_Vertex* adjV = pVerts + v;

          --adjV->iNumTrisLeft;

          if (iTimeStamp - adjV->iCachePos > (int)CacheSize)
            adjV->iCachePos = iTimeStamp++;
        }
        pT->bAdded = 1;
      }
    }

    // select next fanning vertex
    // Get-Next-Vertex
    {
      int n = -1, p = -1; // best candidate and its priority
      for (unsigned int k = 0; k < N.size(); ++k)
      {
        // for each vertex in N
        Opt_Vertex* pV = pVerts + N[k];

        if (pV->iNumTrisLeft > 0)
        {
          // note: in the paper's pseudo code the literal p should be named m
          // here in order to find the best vertex
          int m = 0;
          if (iTimeStamp - pV->iCachePos + 2 * pV->iNumTrisLeft <= (int)CacheSize)
            m = iTimeStamp - pV->iCachePos;

          if (m > p)
          {
            p = m;
            n = N[k];
          }
        }
      }

      if (n == -1)
      {
        // Skip-Dead-End
        while (DeadEndVertexStack.length() && (n == -1))
        {
          // unsigned int d = DeadEndVertexStack.back();
          // DeadEndVertexStack.pop_back();
          const unsigned int d = DeadEndVertexStack.pop();

          if (pVerts[d].iNumTrisLeft > 0)
            n = d;
        }

        while (i+1 < NumVerts && (n == -1))
        {
          ++i;
          if (pVerts[i].iNumTrisLeft > 0)
            n = i;
        }
      }

      // continue fanning from the selected vertex (-1 terminates the outer loop)
      f = n;
    }
  }

  // debugging purpose
  // int capac = N.capacity();
  // capac = DeadEndVertexStack.capacity();

  delete [] vertexTriAdjBuf;

  delete [] pVerts;
  delete [] pTris;
}
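
// The RingStack used above is declared in GPUCacheOptimizer.hh. Conceptually
// it behaves like the following fixed-capacity ring buffer sketch
// (illustrative only, not the actual implementation):
//
//   struct RingStackSketch {
//     std::vector<unsigned int> data; // fixed capacity
//     unsigned int count, top;        // #elements, next write position
//     explicit RingStackSketch(unsigned int cap) : data(cap), count(0), top(0) {}
//     unsigned int length() const { return count; }
//     void push(unsigned int v) {     // overwrites the oldest entry when full
//       data[top] = v; top = (top + 1) % data.size();
//       if (count < data.size()) ++count;
//     }
//     unsigned int pop() {            // returns the most recently pushed value
//       top = (top + data.size() - 1) % data.size(); --count; return data[top];
//     }
//   };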

//=============================================================================

GPUCacheEfficiencyTester::GPUCacheEfficiencyTester(unsigned int NumTris, unsigned int NumVerts,
                                                   unsigned int IndexSize, const void* pIndices)
: GPUCacheOptimizer(NumTris, NumVerts, IndexSize, pIndices)
{
  // identity mapping: measures the cache efficiency of the input order as-is
  for (unsigned int i = 0; i < NumTris; ++i) m_pTriMap[i] = i;
}

//=============================================================================


} // namespace ACG