-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy patha00029_source.html
108 lines (106 loc) · 21.1 KB
/
a00029_source.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- Doxygen-generated source listing page for src/arch/dotproductavx.cpp. -->
<!-- lang / xml:lang declare the document language so assistive technology picks the right pronunciation rules. -->
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<!-- "text/xhtml" is not a registered media type; text/html is the valid value for this pragma. -->
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.14"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>tesseract: /usr/src/tesseract-ocr.master/src/arch/dotproductavx.cpp Source File</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */
// Register initResizable as a jQuery DOM-ready callback.
$(document).ready(initResizable);
/* @license-end */</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<!-- Title area: project name and version number. -->
<div id="titlearea">
<!-- This table is used purely for layout; role="presentation" stops assistive technology from announcing it as a data table. -->
<table cellspacing="0" cellpadding="0" role="presentation">
<tbody>
<tr style="height: 56px;">
<td id="projectalign" style="padding-left: 0.5em;">
<div id="projectname">tesseract
 <span id="projectnumber">4.0.0-1-g2a2b</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.14 -->
<!-- Inline bootstrap for the search box and the top menu. -->
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */
// Global searchBox object; the inline on* handlers of #MSearchSelectWindow call its methods.
// NOTE(review): the SearchBox constructor is not defined in this page - presumably it comes from search/search.js; confirm.
var searchBox = new SearchBox("searchBox", "search",false,'Search');
/* @license-end */
</script>
<script type="text/javascript" src="menudata.js"></script>
<script type="text/javascript" src="menu.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */
// DOM-ready callback: calls initMenu(), then registers a second ready callback that calls init_search().
// NOTE(review): initMenu and init_search are defined outside this page (presumably menu.js and search/search.js);
// the nested registration affects when init_search() runs relative to other ready handlers, so keep it as is.
$(function() {
initMenu('',true,false,'search.php','Search');
$(document).ready(function() { init_search(); });
});
/* @license-end */</script>
<div id="main-nav"></div>
</div><!-- top -->
<!-- Navigation sidebar: tree container plus the drag handle used to resize it. -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<!-- Drag handle: text selection is disabled in every engine, not only Gecko (-moz-), so dragging does not highlight page text. -->
<div id="splitbar" style="-moz-user-select:none;-webkit-user-select:none;-ms-user-select:none;user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */
// When the DOM is ready, call initNavTree with this page's file name and an empty second argument.
$(function () {
  initNavTree('a00029_source.html', '');
});
/* @license-end */
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<!-- Empty container in this markup. Its mouse and key events are forwarded by the inline handlers below
     to the global searchBox object (created by the inline script inside the #top block). -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<!-- The title attribute gives the frame an accessible name for screen-reader users. -->
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults" title="Search results">
</iframe>
</div>
<!-- Page heading: shows the name of the source file whose listing follows. -->
<div class="header">
<div class="headertitle">
<div class="title">dotproductavx.cpp</div> </div>
</div><!--header-->
<div class="contents">
<a href="a00029.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno"> 1</span> <span class="comment">// File: dotproductavx.cpp</span></div><div class="line"><a name="l00003"></a><span class="lineno"> 3</span> <span class="comment">// Description: Architecture-specific dot-product function.</span></div><div class="line"><a name="l00004"></a><span class="lineno"> 4</span> <span class="comment">// Author: Ray Smith</span></div><div class="line"><a name="l00005"></a><span class="lineno"> 5</span> <span class="comment">// Created: Wed Jul 22 10:48:05 PDT 2015</span></div><div class="line"><a name="l00006"></a><span class="lineno"> 6</span> <span class="comment">//</span></div><div class="line"><a name="l00007"></a><span class="lineno"> 7</span> <span class="comment">// (C) Copyright 2015, Google Inc.</span></div><div class="line"><a name="l00008"></a><span class="lineno"> 8</span> <span class="comment">// Licensed under the Apache License, Version 2.0 (the "License");</span></div><div class="line"><a name="l00009"></a><span class="lineno"> 9</span> <span class="comment">// you may not use this file except in compliance with the License.</span></div><div class="line"><a name="l00010"></a><span class="lineno"> 10</span> <span class="comment">// You may obtain a copy of the License at</span></div><div class="line"><a name="l00011"></a><span class="lineno"> 11</span> <span class="comment">// http://www.apache.org/licenses/LICENSE-2.0</span></div><div class="line"><a name="l00012"></a><span class="lineno"> 12</span> <span class="comment">// Unless required by applicable law or agreed to in writing, software</span></div><div class="line"><a name="l00013"></a><span class="lineno"> 13</span> <span class="comment">// distributed under the License is distributed on an "AS IS" BASIS,</span></div><div class="line"><a name="l00014"></a><span class="lineno"> 14</span> <span class="comment">// WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span></div><div class="line"><a name="l00015"></a><span class="lineno"> 15</span> <span class="comment">// See the License for the specific language governing permissions and</span></div><div class="line"><a name="l00016"></a><span class="lineno"> 16</span> <span class="comment">// limitations under the License.</span></div><div class="line"><a name="l00018"></a><span class="lineno"> 18</span> <span class="comment"></span></div><div class="line"><a name="l00019"></a><span class="lineno"> 19</span> <span class="preprocessor">#if !defined(__AVX__)</span></div><div class="line"><a name="l00020"></a><span class="lineno"> 20</span> <span class="comment">// Implementation for non-avx archs.</span></div><div class="line"><a name="l00021"></a><span class="lineno"> 21</span> </div><div class="line"><a name="l00022"></a><span class="lineno"> 22</span> <span class="preprocessor">#include "<a class="code" href="a00032.html">dotproductavx.h</a>"</span></div><div class="line"><a name="l00023"></a><span class="lineno"> 23</span> <span class="preprocessor">#include &lt;cstdio&gt;</span></div><div class="line"><a name="l00024"></a><span class="lineno"> 24</span> <span class="preprocessor">#include &lt;cstdlib&gt;</span></div><div class="line"><a name="l00025"></a><span class="lineno"> 25</span> </div><div class="line"><a name="l00026"></a><span class="lineno"> 26</span> <span class="keyword">namespace </span><a class="code" href="a01629.html">tesseract</a> {</div><div class="line"><a name="l00027"></a><span class="lineno"><a class="line" href="a01629.html#aa41529fcf80de7ee0e8cc5691fbee53b"> 27</a></span> <span class="keywordtype">double</span> <a class="code" href="a01629.html#aa41529fcf80de7ee0e8cc5691fbee53b">DotProductAVX</a>(<span class="keyword">const</span> <span class="keywordtype">double</span>* u, <span class="keyword">const</span> <span class="keywordtype">double</span>* v, <span class="keywordtype">int</span> n) 
{</div><div class="line"><a name="l00028"></a><span class="lineno"> 28</span>  fprintf(stderr, <span class="stringliteral">"DotProductAVX can't be used on Android\n"</span>);</div><div class="line"><a name="l00029"></a><span class="lineno"> 29</span>  abort();</div><div class="line"><a name="l00030"></a><span class="lineno"> 30</span> }</div><div class="line"><a name="l00031"></a><span class="lineno"> 31</span> } <span class="comment">// namespace tesseract</span></div><div class="line"><a name="l00032"></a><span class="lineno"> 32</span> </div><div class="line"><a name="l00033"></a><span class="lineno"> 33</span> <span class="preprocessor">#else // !defined(__AVX__)</span></div><div class="line"><a name="l00034"></a><span class="lineno"> 34</span> <span class="comment">// Implementation for avx capable archs.</span></div><div class="line"><a name="l00035"></a><span class="lineno"> 35</span> <span class="preprocessor">#include &lt;immintrin.h&gt;</span></div><div class="line"><a name="l00036"></a><span class="lineno"> 36</span> <span class="preprocessor">#include &lt;cstdint&gt;</span></div><div class="line"><a name="l00037"></a><span class="lineno"> 37</span> <span class="preprocessor">#include "<a class="code" href="a00032.html">dotproductavx.h</a>"</span></div><div class="line"><a name="l00038"></a><span class="lineno"> 38</span> </div><div class="line"><a name="l00039"></a><span class="lineno"> 39</span> <span class="keyword">namespace </span><a class="code" href="a01629.html">tesseract</a> {</div><div class="line"><a name="l00040"></a><span class="lineno"> 40</span> </div><div class="line"><a name="l00041"></a><span class="lineno"> 41</span> <span class="comment">// Computes and returns the dot product of the n-vectors u and v.</span></div><div class="line"><a name="l00042"></a><span class="lineno"> 42</span> <span class="comment">// Uses Intel AVX intrinsics to access the SIMD instruction set.</span></div><div class="line"><a name="l00043"></a><span class="lineno"> 
43</span> <span class="keywordtype">double</span> <a class="code" href="a01629.html#aa41529fcf80de7ee0e8cc5691fbee53b">DotProductAVX</a>(<span class="keyword">const</span> <span class="keywordtype">double</span>* u, <span class="keyword">const</span> <span class="keywordtype">double</span>* v, <span class="keywordtype">int</span> n) {</div><div class="line"><a name="l00044"></a><span class="lineno"> 44</span>  <span class="keywordtype">int</span> max_offset = n - 4;</div><div class="line"><a name="l00045"></a><span class="lineno"> 45</span>  <span class="keywordtype">int</span> offset = 0;</div><div class="line"><a name="l00046"></a><span class="lineno"> 46</span>  <span class="comment">// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and</span></div><div class="line"><a name="l00047"></a><span class="lineno"> 47</span>  <span class="comment">// v, and multiplying them together in parallel.</span></div><div class="line"><a name="l00048"></a><span class="lineno"> 48</span>  __m256d sum = _mm256_setzero_pd();</div><div class="line"><a name="l00049"></a><span class="lineno"> 49</span>  <span class="keywordflow">if</span> (offset &lt;= max_offset) {</div><div class="line"><a name="l00050"></a><span class="lineno"> 50</span>  offset = 4;</div><div class="line"><a name="l00051"></a><span class="lineno"> 51</span>  <span class="comment">// Aligned load is reputedly faster but requires 32 byte aligned input.</span></div><div class="line"><a name="l00052"></a><span class="lineno"> 52</span>  <span class="keywordflow">if</span> ((reinterpret_cast&lt;uintptr_t&gt;(u) &amp; 31) == 0 &amp;&amp;</div><div class="line"><a name="l00053"></a><span class="lineno"> 53</span>  (reinterpret_cast&lt;uintptr_t&gt;(v) &amp; 31) == 0) {</div><div class="line"><a name="l00054"></a><span class="lineno"> 54</span>  <span class="comment">// Use aligned load.</span></div><div class="line"><a name="l00055"></a><span class="lineno"> 55</span>  __m256d floats1 = _mm256_load_pd(u);</div><div 
class="line"><a name="l00056"></a><span class="lineno"> 56</span>  __m256d floats2 = _mm256_load_pd(v);</div><div class="line"><a name="l00057"></a><span class="lineno"> 57</span>  <span class="comment">// Multiply.</span></div><div class="line"><a name="l00058"></a><span class="lineno"> 58</span>  sum = _mm256_mul_pd(floats1, floats2);</div><div class="line"><a name="l00059"></a><span class="lineno"> 59</span>  <span class="keywordflow">while</span> (offset &lt;= max_offset) {</div><div class="line"><a name="l00060"></a><span class="lineno"> 60</span>  floats1 = _mm256_load_pd(u + offset);</div><div class="line"><a name="l00061"></a><span class="lineno"> 61</span>  floats2 = _mm256_load_pd(v + offset);</div><div class="line"><a name="l00062"></a><span class="lineno"> 62</span>  offset += 4;</div><div class="line"><a name="l00063"></a><span class="lineno"> 63</span>  __m256d product = _mm256_mul_pd(floats1, floats2);</div><div class="line"><a name="l00064"></a><span class="lineno"> 64</span>  sum = _mm256_add_pd(sum, product);</div><div class="line"><a name="l00065"></a><span class="lineno"> 65</span>  }</div><div class="line"><a name="l00066"></a><span class="lineno"> 66</span>  } <span class="keywordflow">else</span> {</div><div class="line"><a name="l00067"></a><span class="lineno"> 67</span>  <span class="comment">// Use unaligned load.</span></div><div class="line"><a name="l00068"></a><span class="lineno"> 68</span>  __m256d floats1 = _mm256_loadu_pd(u);</div><div class="line"><a name="l00069"></a><span class="lineno"> 69</span>  __m256d floats2 = _mm256_loadu_pd(v);</div><div class="line"><a name="l00070"></a><span class="lineno"> 70</span>  <span class="comment">// Multiply.</span></div><div class="line"><a name="l00071"></a><span class="lineno"> 71</span>  sum = _mm256_mul_pd(floats1, floats2);</div><div class="line"><a name="l00072"></a><span class="lineno"> 72</span>  <span class="keywordflow">while</span> (offset &lt;= max_offset) {</div><div class="line"><a 
name="l00073"></a><span class="lineno"> 73</span>  floats1 = _mm256_loadu_pd(u + offset);</div><div class="line"><a name="l00074"></a><span class="lineno"> 74</span>  floats2 = _mm256_loadu_pd(v + offset);</div><div class="line"><a name="l00075"></a><span class="lineno"> 75</span>  offset += 4;</div><div class="line"><a name="l00076"></a><span class="lineno"> 76</span>  __m256d product = _mm256_mul_pd(floats1, floats2);</div><div class="line"><a name="l00077"></a><span class="lineno"> 77</span>  sum = _mm256_add_pd(sum, product);</div><div class="line"><a name="l00078"></a><span class="lineno"> 78</span>  }</div><div class="line"><a name="l00079"></a><span class="lineno"> 79</span>  }</div><div class="line"><a name="l00080"></a><span class="lineno"> 80</span>  }</div><div class="line"><a name="l00081"></a><span class="lineno"> 81</span>  <span class="comment">// Add the 4 product sums together horizontally. Not so easy as with sse, as</span></div><div class="line"><a name="l00082"></a><span class="lineno"> 82</span>  <span class="comment">// there is no add across the upper/lower 128 bit boundary, so permute to</span></div><div class="line"><a name="l00083"></a><span class="lineno"> 83</span>  <span class="comment">// move the upper 128 bits to lower in another register.</span></div><div class="line"><a name="l00084"></a><span class="lineno"> 84</span>  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);</div><div class="line"><a name="l00085"></a><span class="lineno"> 85</span>  sum = _mm256_hadd_pd(sum, sum2);</div><div class="line"><a name="l00086"></a><span class="lineno"> 86</span>  sum = _mm256_hadd_pd(sum, sum);</div><div class="line"><a name="l00087"></a><span class="lineno"> 87</span>  <span class="keywordtype">double</span> result;</div><div class="line"><a name="l00088"></a><span class="lineno"> 88</span>  <span class="comment">// _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse</span></div><div class="line"><a 
name="l00089"></a><span class="lineno"> 89</span>  <span class="comment">// instruction, as that introduces a 70 cycle delay. All this casting is to</span></div><div class="line"><a name="l00090"></a><span class="lineno"> 90</span>  <span class="comment">// fool the intrinsics into thinking we are extracting the bottom int64.</span></div><div class="line"><a name="l00091"></a><span class="lineno"> 91</span>  <span class="keyword">auto</span> cast_sum = _mm256_castpd_si256(sum);</div><div class="line"><a name="l00092"></a><span class="lineno"> 92</span> <span class="preprocessor">#pragma GCC diagnostic push</span></div><div class="line"><a name="l00093"></a><span class="lineno"> 93</span> <span class="preprocessor">#pragma GCC diagnostic ignored "-Wstrict-aliasing"</span></div><div class="line"><a name="l00094"></a><span class="lineno"> 94</span>  *(<span class="keyword">reinterpret_cast&lt;</span>int64_t*<span class="keyword">&gt;</span>(&amp;result)) =</div><div class="line"><a name="l00095"></a><span class="lineno"> 95</span> <span class="preprocessor">#if defined(_WIN32) || defined(__i386__)</span></div><div class="line"><a name="l00096"></a><span class="lineno"> 96</span>  <span class="comment">// This is a very simple workaround that is activated</span></div><div class="line"><a name="l00097"></a><span class="lineno"> 97</span>  <span class="comment">// for all platforms that do not have _mm256_extract_epi64.</span></div><div class="line"><a name="l00098"></a><span class="lineno"> 98</span>  <span class="comment">// _mm256_extract_epi64(X, Y) == ((uint64_t*)&amp;X)[Y]</span></div><div class="line"><a name="l00099"></a><span class="lineno"> 99</span>  ((uint64_t*)&amp;cast_sum)[0]</div><div class="line"><a name="l00100"></a><span class="lineno"> 100</span> <span class="preprocessor">#else</span></div><div class="line"><a name="l00101"></a><span class="lineno"> 101</span>  _mm256_extract_epi64(cast_sum, 0)</div><div class="line"><a name="l00102"></a><span class="lineno"> 
102</span> <span class="preprocessor">#endif</span></div><div class="line"><a name="l00103"></a><span class="lineno"> 103</span>  ;</div><div class="line"><a name="l00104"></a><span class="lineno"> 104</span> <span class="preprocessor">#pragma GCC diagnostic pop</span></div><div class="line"><a name="l00105"></a><span class="lineno"> 105</span>  <span class="keywordflow">while</span> (offset &lt; n) {</div><div class="line"><a name="l00106"></a><span class="lineno"> 106</span>  result += u[offset] * v[offset];</div><div class="line"><a name="l00107"></a><span class="lineno"> 107</span>  ++offset;</div><div class="line"><a name="l00108"></a><span class="lineno"> 108</span>  }</div><div class="line"><a name="l00109"></a><span class="lineno"> 109</span>  <span class="keywordflow">return</span> result;</div><div class="line"><a name="l00110"></a><span class="lineno"> 110</span> }</div><div class="line"><a name="l00111"></a><span class="lineno"> 111</span> </div><div class="line"><a name="l00112"></a><span class="lineno"> 112</span> } <span class="comment">// namespace tesseract.</span></div><div class="line"><a name="l00113"></a><span class="lineno"> 113</span> </div><div class="line"><a name="l00114"></a><span class="lineno"> 114</span> <span class="preprocessor">#endif // ANDROID_BUILD</span></div><div class="ttc" id="a01629_html_aa41529fcf80de7ee0e8cc5691fbee53b"><div class="ttname"><a href="a01629.html#aa41529fcf80de7ee0e8cc5691fbee53b">tesseract::DotProductAVX</a></div><div class="ttdeci">double DotProductAVX(const double *u, const double *v, int n)</div><div class="ttdef"><b>Definition:</b> <a href="a00029_source.html#l00027">dotproductavx.cpp:27</a></div></div>
<div class="ttc" id="a00032_html"><div class="ttname"><a href="a00032.html">dotproductavx.h</a></div></div>
<div class="ttc" id="a01629_html"><div class="ttname"><a href="a01629.html">tesseract</a></div><div class="ttdef"><b>Definition:</b> <a href="a00005_source.html#l00094">baseapi.cpp:94</a></div></div>
</div><!-- fragment --></div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<!-- Breadcrumb trail to this file, followed by the generator credit. -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="navelem"><a class="el" href="dir_fce9a394c0d2c636e610a0a34fe30580.html">tesseract-ocr.master</a></li><li class="navelem"><a class="el" href="dir_fd2783e80b2d56815818e17a68fc4d98.html">src</a></li><li class="navelem"><a class="el" href="dir_112254772d44b3d8efe7783321f69b12.html">arch</a></li><li class="navelem"><a class="el" href="a00029.html">dotproductavx.cpp</a></li>
<li class="footer">Generated on Mon Oct 29 2018 11:03:42 for tesseract by
<!-- External link uses https so visitors are not sent over an unencrypted connection. -->
<a href="https://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
</ul>
</div>
</body>
</html>