ViennaCL - The Vienna Computing Library  1.5.1
vector_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include <cmath>
26 #include "viennacl/forwards.h"
27 #include "viennacl/scalar.hpp"
28 #include "viennacl/tools/tools.hpp"
31 #include "viennacl/traits/size.hpp"
34 
35 namespace viennacl
36 {
37  namespace linalg
38  {
39  namespace cuda
40  {
41 
42  //
43  // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check them here!
44  //
45 
46 
48 
49  // gpu scalar
50  template <typename T>
51  __global__ void av_kernel(T * vec1,
52  unsigned int start1,
53  unsigned int inc1,
54  unsigned int size1,
55 
56  const T * fac2,
57  unsigned int options2,
58  const T * vec2,
59  unsigned int start2,
60  unsigned int inc2)
61  {
62  T alpha = *fac2;
63  if (options2 & (1 << 0))
64  alpha = -alpha;
65 
66  if (options2 & (1 << 1))
67  {
68  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
69  i < size1;
70  i += gridDim.x * blockDim.x)
71  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
72  }
73  else
74  {
75  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
76  i < size1;
77  i += gridDim.x * blockDim.x)
78  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
79  }
80  }
81 
82  // cpu scalar
83  template <typename T>
84  __global__ void av_kernel(T * vec1,
85  unsigned int start1,
86  unsigned int inc1,
87  unsigned int size1,
88 
89  T fac2,
90  unsigned int options2,
91  const T * vec2,
92  unsigned int start2,
93  unsigned int inc2)
94  {
95  T alpha = fac2;
96  if (options2 & (1 << 0))
97  alpha = -alpha;
98 
99  if (options2 & (1 << 1))
100  {
101  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
102  i < size1;
103  i += gridDim.x * blockDim.x)
104  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
105  }
106  else
107  {
108  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
109  i < size1;
110  i += gridDim.x * blockDim.x)
111  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
112  }
113  }
114 
115 
116 
117  template <typename T, typename ScalarType1>
118  void av(vector_base<T> & vec1,
119  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
120  {
121  typedef T value_type;
122 
123  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
124 
125  value_type data_alpha = alpha;
126  if (flip_sign_alpha)
127  data_alpha = -data_alpha;
128  if (reciprocal_alpha)
129  data_alpha = static_cast<value_type>(1) / data_alpha;
130 
131  value_type temporary_alpha = 0;
132  if (viennacl::is_cpu_scalar<ScalarType1>::value)
133  temporary_alpha = alpha;
134 
135  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
136  static_cast<unsigned int>(viennacl::traits::start(vec1)),
137  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
138  static_cast<unsigned int>(viennacl::traits::size(vec1)),
139 
140  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
141  options_alpha,
142  detail::cuda_arg<value_type>(vec2),
143  static_cast<unsigned int>(viennacl::traits::start(vec2)),
144  static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
145  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
146  }
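 // Usage sketch (illustration only, not part of this header). av() realizes
 // vec1 = alpha * vec2, with the options word from detail::make_options() decoded in
 // av_kernel: bit 0 flips the sign of alpha, bit 1 divides by alpha instead of multiplying.
 // Assuming a CUDA-enabled build (VIENNACL_WITH_CUDA) so that this backend is active,
 // expressions like the following are expected to reach av():
 //
 //   #include "viennacl/vector.hpp"
 //
 //   void scale_example()
 //   {
 //     viennacl::vector<float> x(1000), y(1000);
 //     x = 2.0f * y;   // CPU-scalar overload of av_kernel, plain multiplication
 //     x = y / 2.0f;   // reciprocal_alpha = true, i.e. bit 1 of options_alpha is set
 //   }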
147 
148 
150 
151  // alpha and beta on GPU
152  template <typename T>
153  __global__ void avbv_kernel(T * vec1,
154  unsigned int start1,
155  unsigned int inc1,
156  unsigned int size1,
157 
158  const T * fac2,
159  unsigned int options2,
160  const T * vec2,
161  unsigned int start2,
162  unsigned int inc2,
163 
164  const T * fac3,
165  unsigned int options3,
166  const T * vec3,
167  unsigned int start3,
168  unsigned int inc3)
169  {
170  T alpha = *fac2;
171  if (options2 & (1 << 0))
172  alpha = -alpha;
173 
174  T beta = *fac3;
175  if (options3 & (1 << 0))
176  beta = -beta;
177 
178  if (options2 & (1 << 1))
179  {
180  if (options3 & (1 << 1))
181  {
182  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
183  i < size1;
184  i += gridDim.x * blockDim.x)
185  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
186  }
187  else
188  {
189  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
190  i < size1;
191  i += gridDim.x * blockDim.x)
192  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
193  }
194  }
195  else
196  {
197  if (options3 & (1 << 1))
198  {
199  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
200  i < size1;
201  i += gridDim.x * blockDim.x)
202  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
203  }
204  else
205  {
206  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
207  i < size1;
208  i += gridDim.x * blockDim.x)
209  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
210  }
211  }
212  }
213 
214  // alpha on CPU, beta on GPU
215  template <typename T>
216  __global__ void avbv_kernel(T * vec1,
217  unsigned int start1,
218  unsigned int inc1,
219  unsigned int size1,
220 
221  T fac2,
222  unsigned int options2,
223  const T * vec2,
224  unsigned int start2,
225  unsigned int inc2,
226 
227  const T * fac3,
228  unsigned int options3,
229  const T * vec3,
230  unsigned int start3,
231  unsigned int inc3)
232  {
233  T alpha = fac2;
234  if (options2 & (1 << 0))
235  alpha = -alpha;
236 
237  T beta = *fac3;
238  if (options3 & (1 << 0))
239  beta = -beta;
240 
241  if (options2 & (1 << 1))
242  {
243  if (options3 & (1 << 1))
244  {
245  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
246  i < size1;
247  i += gridDim.x * blockDim.x)
248  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
249  }
250  else
251  {
252  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
253  i < size1;
254  i += gridDim.x * blockDim.x)
255  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
256  }
257  }
258  else
259  {
260  if (options3 & (1 << 1))
261  {
262  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
263  i < size1;
264  i += gridDim.x * blockDim.x)
265  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
266  }
267  else
268  {
269  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
270  i < size1;
271  i += gridDim.x * blockDim.x)
272  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
273  }
274  }
275  }
276 
277  // alpha on GPU, beta on CPU
278  template <typename T>
279  __global__ void avbv_kernel(T * vec1,
280  unsigned int start1,
281  unsigned int inc1,
282  unsigned int size1,
283 
284  const T * fac2,
285  unsigned int options2,
286  const T * vec2,
287  unsigned int start2,
288  unsigned int inc2,
289 
290  T fac3,
291  unsigned int options3,
292  const T * vec3,
293  unsigned int start3,
294  unsigned int inc3)
295  {
296  T alpha = *fac2;
297  if (options2 & (1 << 0))
298  alpha = -alpha;
299 
300  T beta = fac3;
301  if (options3 & (1 << 0))
302  beta = -beta;
303 
304  if (options2 & (1 << 1))
305  {
306  if (options3 & (1 << 1))
307  {
308  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
309  i < size1;
310  i += gridDim.x * blockDim.x)
311  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
312  }
313  else
314  {
315  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
316  i < size1;
317  i += gridDim.x * blockDim.x)
318  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
319  }
320  }
321  else
322  {
323  if (options3 & (1 << 1))
324  {
325  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
326  i < size1;
327  i += gridDim.x * blockDim.x)
328  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
329  }
330  else
331  {
332  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
333  i < size1;
334  i += gridDim.x * blockDim.x)
335  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
336  }
337  }
338  }
339 
340  // alpha and beta on CPU
341  template <typename T>
342  __global__ void avbv_kernel(T * vec1,
343  unsigned int start1,
344  unsigned int inc1,
345  unsigned int size1,
346 
347  T fac2,
348  unsigned int options2,
349  const T * vec2,
350  unsigned int start2,
351  unsigned int inc2,
352 
353  T fac3,
354  unsigned int options3,
355  const T * vec3,
356  unsigned int start3,
357  unsigned int inc3)
358  {
359  T alpha = fac2;
360  if (options2 & (1 << 0))
361  alpha = -alpha;
362 
363  T beta = fac3;
364  if (options3 & (1 << 0))
365  beta = -beta;
366 
367  if (options2 & (1 << 1))
368  {
369  if (options3 & (1 << 1))
370  {
371  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
372  i < size1;
373  i += gridDim.x * blockDim.x)
374  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
375  }
376  else
377  {
378  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
379  i < size1;
380  i += gridDim.x * blockDim.x)
381  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
382  }
383  }
384  else
385  {
386  if (options3 & (1 << 1))
387  {
388  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
389  i < size1;
390  i += gridDim.x * blockDim.x)
391  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
392  }
393  else
394  {
395  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
396  i < size1;
397  i += gridDim.x * blockDim.x)
398  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
399  }
400  }
401  }
402 
403 
404 
405 
406  template <typename T, typename ScalarType1, typename ScalarType2>
407  void avbv(vector_base<T> & vec1,
408  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
409  vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
410  {
411  typedef T value_type;
412 
413  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
414 
415  value_type data_alpha = alpha;
416  if (flip_sign_alpha)
417  data_alpha = -data_alpha;
418  if (reciprocal_alpha)
419  data_alpha = static_cast<value_type>(1) / data_alpha;
420 
421  value_type temporary_alpha = 0;
422  if (viennacl::is_cpu_scalar<ScalarType1>::value)
423  temporary_alpha = alpha;
424 
425  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
426 
427  value_type temporary_beta = 0;
428  if (viennacl::is_cpu_scalar<ScalarType2>::value)
429  temporary_beta = beta;
430 
431 
432  avbv_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
433  static_cast<unsigned int>(viennacl::traits::start(vec1)),
434  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
435  static_cast<unsigned int>(viennacl::traits::size(vec1)),
436 
437  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
438  options_alpha,
439  detail::cuda_arg<value_type>(vec2),
440  static_cast<unsigned int>(viennacl::traits::start(vec2)),
441  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
442 
443  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
444  options_beta,
445  detail::cuda_arg<value_type>(vec3),
446  static_cast<unsigned int>(viennacl::traits::start(vec3)),
447  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
448  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
449  }
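 // Usage sketch (illustration only, not part of this header). avbv() realizes the fused
 // update vec1 = alpha * vec2 + beta * vec3; the four kernel overloads above differ only in
 // whether alpha and beta live on the host or on the device (viennacl::scalar<>). Assuming a
 // CUDA-enabled build, an expression such as the following is expected to reach avbv():
 //
 //   #include "viennacl/vector.hpp"
 //
 //   void axpby_example()
 //   {
 //     viennacl::vector<double> x(1000), y(1000), z(1000);
 //     x = 2.0 * y + 0.5 * z;   // both coefficients on the CPU; device-resident
 //                              // viennacl::scalar<double> coefficients would select
 //                              // the GPU-scalar kernel overloads instead
 //   }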
450 
451 
453 
454 
455  // alpha and beta on GPU
456  template <typename T>
457  __global__ void avbv_v_kernel(T * vec1,
458  unsigned int start1,
459  unsigned int inc1,
460  unsigned int size1,
461 
462  const T * fac2,
463  unsigned int options2,
464  const T * vec2,
465  unsigned int start2,
466  unsigned int inc2,
467 
468  const T * fac3,
469  unsigned int options3,
470  const T * vec3,
471  unsigned int start3,
472  unsigned int inc3)
473  {
474  T alpha = *fac2;
475  if (options2 & (1 << 0))
476  alpha = -alpha;
477 
478  T beta = *fac3;
479  if (options3 & (1 << 0))
480  beta = -beta;
481 
482  if (options2 & (1 << 1))
483  {
484  if (options3 & (1 << 1))
485  {
486  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
487  i < size1;
488  i += gridDim.x * blockDim.x)
489  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
490  }
491  else
492  {
493  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
494  i < size1;
495  i += gridDim.x * blockDim.x)
496  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
497  }
498  }
499  else
500  {
501  if (options3 & (1 << 1))
502  {
503  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
504  i < size1;
505  i += gridDim.x * blockDim.x)
506  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
507  }
508  else
509  {
510  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
511  i < size1;
512  i += gridDim.x * blockDim.x)
513  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
514  }
515  }
516  }
517 
518  // alpha on CPU, beta on GPU
519  template <typename T>
520  __global__ void avbv_v_kernel(T * vec1,
521  unsigned int start1,
522  unsigned int inc1,
523  unsigned int size1,
524 
525  T fac2,
526  unsigned int options2,
527  const T * vec2,
528  unsigned int start2,
529  unsigned int inc2,
530 
531  const T * fac3,
532  unsigned int options3,
533  const T * vec3,
534  unsigned int start3,
535  unsigned int inc3)
536  {
537  T alpha = fac2;
538  if (options2 & (1 << 0))
539  alpha = -alpha;
540 
541  T beta = *fac3;
542  if (options3 & (1 << 0))
543  beta = -beta;
544 
545  if (options2 & (1 << 1))
546  {
547  if (options3 & (1 << 1))
548  {
549  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
550  i < size1;
551  i += gridDim.x * blockDim.x)
552  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
553  }
554  else
555  {
556  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
557  i < size1;
558  i += gridDim.x * blockDim.x)
559  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
560  }
561  }
562  else
563  {
564  if (options3 & (1 << 1))
565  {
566  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
567  i < size1;
568  i += gridDim.x * blockDim.x)
569  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
570  }
571  else
572  {
573  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
574  i < size1;
575  i += gridDim.x * blockDim.x)
576  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
577  }
578  }
579  }
580 
581  // alpha on GPU, beta on CPU
582  template <typename T>
583  __global__ void avbv_v_kernel(T * vec1,
584  unsigned int start1,
585  unsigned int inc1,
586  unsigned int size1,
587 
588  const T * fac2,
589  unsigned int options2,
590  const T * vec2,
591  unsigned int start2,
592  unsigned int inc2,
593 
594  T fac3,
595  unsigned int options3,
596  const T * vec3,
597  unsigned int start3,
598  unsigned int inc3)
599  {
600  T alpha = *fac2;
601  if (options2 & (1 << 0))
602  alpha = -alpha;
603 
604  T beta = fac3;
605  if (options3 & (1 << 0))
606  beta = -beta;
607 
608  if (options2 & (1 << 1))
609  {
610  if (options3 & (1 << 1))
611  {
612  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
613  i < size1;
614  i += gridDim.x * blockDim.x)
615  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
616  }
617  else
618  {
619  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
620  i < size1;
621  i += gridDim.x * blockDim.x)
622  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
623  }
624  }
625  else
626  {
627  if (options3 & (1 << 1))
628  {
629  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
630  i < size1;
631  i += gridDim.x * blockDim.x)
632  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
633  }
634  else
635  {
636  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
637  i < size1;
638  i += gridDim.x * blockDim.x)
639  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
640  }
641  }
642  }
643 
644  // alpha and beta on CPU
645  template <typename T>
646  __global__ void avbv_v_kernel(T * vec1,
647  unsigned int start1,
648  unsigned int inc1,
649  unsigned int size1,
650 
651  T fac2,
652  unsigned int options2,
653  const T * vec2,
654  unsigned int start2,
655  unsigned int inc2,
656 
657  T fac3,
658  unsigned int options3,
659  const T * vec3,
660  unsigned int start3,
661  unsigned int inc3)
662  {
663  T alpha = fac2;
664  if (options2 & (1 << 0))
665  alpha = -alpha;
666 
667  T beta = fac3;
668  if (options3 & (1 << 0))
669  beta = -beta;
670 
671  if (options2 & (1 << 1))
672  {
673  if (options3 & (1 << 1))
674  {
675  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
676  i < size1;
677  i += gridDim.x * blockDim.x)
678  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
679  }
680  else
681  {
682  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
683  i < size1;
684  i += gridDim.x * blockDim.x)
685  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
686  }
687  }
688  else
689  {
690  if (options3 & (1 << 1))
691  {
692  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
693  i < size1;
694  i += gridDim.x * blockDim.x)
695  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
696  }
697  else
698  {
699  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
700  i < size1;
701  i += gridDim.x * blockDim.x)
702  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
703  }
704  }
705  }
706 
707 
708  template <typename T, typename ScalarType1, typename ScalarType2>
709  void avbv_v(vector_base<T> & vec1,
710  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
711  vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
712  {
713  typedef T value_type;
714 
715  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
716 
717  value_type data_alpha = alpha;
718  if (flip_sign_alpha)
719  data_alpha = -data_alpha;
720  if (reciprocal_alpha)
721  data_alpha = static_cast<value_type>(1) / data_alpha;
722 
723  value_type temporary_alpha = 0;
724  if (viennacl::is_cpu_scalar<ScalarType1>::value)
725  temporary_alpha = alpha;
726 
727  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
728 
729  value_type temporary_beta = 0;
730  if (viennacl::is_cpu_scalar<ScalarType2>::value)
731  temporary_beta = beta;
732 
733 
734  avbv_v_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
735  static_cast<unsigned int>(viennacl::traits::start(vec1)),
736  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
737  static_cast<unsigned int>(viennacl::traits::size(vec1)),
738 
739  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
740  options_alpha,
741  detail::cuda_arg<value_type>(vec2),
742  static_cast<unsigned int>(viennacl::traits::start(vec2)),
743  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
744 
745  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
746  options_beta,
747  detail::cuda_arg<value_type>(vec3),
748  static_cast<unsigned int>(viennacl::traits::start(vec3)),
749  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
750  }
751 
752 
754 
755  template <typename T>
756  __global__ void vector_assign_kernel(T * vec1,
757  unsigned int start1,
758  unsigned int inc1,
759  unsigned int size1,
760  unsigned int internal_size1,
761 
762  T alpha)
763  {
764  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
765  i < internal_size1;
766  i += gridDim.x * blockDim.x)
767  vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
768  }
769 
776  template <typename T, typename S1>
777  void vector_assign(vector_base<T> & vec1, const S1 & alpha, bool up_to_internal_size = false)
778  {
779  typedef T value_type;
780 
781  value_type temporary_alpha = 0;
782  if (viennacl::is_cpu_scalar<S1>::value)
783  temporary_alpha = alpha;
784 
785  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
786 
787  vector_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
788  static_cast<unsigned int>(viennacl::traits::start(vec1)),
789  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
790  size,
791  static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
792 
793  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
794  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
795  }
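 // Usage sketch (illustration only, not part of this header). vector_assign() writes the same
 // value into every entry; with up_to_internal_size = true the padded tail of the buffer is
 // cleared to zero as well, which is why the kernel loops up to internal_size1 and guards the
 // stored value with (i < size1). Operations that typically end up here (assuming the CUDA
 // backend is active):
 //
 //   #include "viennacl/vector.hpp"
 //
 //   void assign_example()
 //   {
 //     viennacl::vector<float> x(1000);
 //     x.clear();                                  // set all entries to zero
 //     viennacl::linalg::vector_assign(x, 3.14f);  // set all entries to a constant
 //   }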
796 
798 
799  template <typename T>
800  __global__ void vector_swap_kernel(T * vec1,
801  unsigned int start1,
802  unsigned int inc1,
803  unsigned int size1,
804 
805  T * vec2,
806  unsigned int start2,
807  unsigned int inc2)
808  {
809  T tmp;
810  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
811  i < size1;
812  i += gridDim.x * blockDim.x)
813  {
814  tmp = vec2[i*inc2+start2];
815  vec2[i*inc2+start2] = vec1[i*inc1+start1];
816  vec1[i*inc1+start1] = tmp;
817  }
818  }
819 
820 
826  template <typename T>
827  void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
828  {
829  typedef T value_type;
830 
831  vector_swap_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
832  static_cast<unsigned int>(viennacl::traits::start(vec1)),
833  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
834  static_cast<unsigned int>(viennacl::traits::size(vec1)),
835 
836  detail::cuda_arg<value_type>(vec2),
837  static_cast<unsigned int>(viennacl::traits::start(vec2)),
838  static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
839  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
840  }
841 
843 
844  template <typename T>
845  __global__ void element_op_kernel(T * vec1,
846  unsigned int start1,
847  unsigned int inc1,
848  unsigned int size1,
849 
850  T const * vec2,
851  unsigned int start2,
852  unsigned int inc2,
853 
854  T const * vec3,
855  unsigned int start3,
856  unsigned int inc3,
857 
858  unsigned int op_type
859  )
860  {
861  if (op_type == 2)
862  {
863  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
864  i < size1;
865  i += gridDim.x * blockDim.x)
866  {
867  vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
868  }
869  }
870  else if (op_type == 1)
871  {
872  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
873  i < size1;
874  i += gridDim.x * blockDim.x)
875  {
876  vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
877  }
878  }
879  else if (op_type == 0)
880  {
881  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
882  i < size1;
883  i += gridDim.x * blockDim.x)
884  {
885  vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
886  }
887  }
888  }
889 
890  template <typename T>
891  __global__ void element_op_int_kernel(T * vec1,
892  unsigned int start1,
893  unsigned int inc1,
894  unsigned int size1,
895 
896  T const * vec2,
897  unsigned int start2,
898  unsigned int inc2,
899 
900  T const * vec3,
901  unsigned int start3,
902  unsigned int inc3,
903 
904  unsigned int op_type
905  )
906  {
907  if (op_type == 1)
908  {
909  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
910  i < size1;
911  i += gridDim.x * blockDim.x)
912  {
913  vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
914  }
915  }
916  else if (op_type == 0)
917  {
918  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
919  i < size1;
920  i += gridDim.x * blockDim.x)
921  {
922  vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
923  }
924  }
925  }
926 
932  template <typename T, typename OP>
933  void element_op(vector_base<T> & vec1,
934  vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
935  {
936  typedef T value_type;
937 
938  unsigned int op_type = 2; //0: product, 1: division, 2: power
939  if (viennacl::is_division<OP>::value)
940  op_type = 1;
941  else if (viennacl::is_product<OP>::value)
942  op_type = 0;
943 
944  element_op_int_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
945  static_cast<unsigned int>(viennacl::traits::start(vec1)),
946  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
947  static_cast<unsigned int>(viennacl::traits::size(vec1)),
948 
949  detail::cuda_arg<value_type>(proxy.lhs()),
950  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
951  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
952 
953  detail::cuda_arg<value_type>(proxy.rhs()),
954  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
955  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
956 
957  op_type
958  );
959  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
960  }
961 
962  template <typename OP>
963  void element_op(vector_base<float> & vec1,
964  vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OP> > const & proxy)
965  {
966  typedef float value_type;
967 
968  unsigned int op_type = 2; //0: product, 1: division, 2: power
969  if (viennacl::is_division<OP>::value)
970  op_type = 1;
971  else if (viennacl::is_product<OP>::value)
972  op_type = 0;
973 
974  element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
975  static_cast<unsigned int>(viennacl::traits::start(vec1)),
976  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
977  static_cast<unsigned int>(viennacl::traits::size(vec1)),
978 
979  detail::cuda_arg<value_type>(proxy.lhs()),
980  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
981  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
982 
983  detail::cuda_arg<value_type>(proxy.rhs()),
984  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
985  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
986 
987  op_type
988  );
989  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
990  }
991 
992  template <typename OP>
993  void element_op(vector_base<double> & vec1,
994  vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OP> > const & proxy)
995  {
996  typedef double value_type;
997 
998  unsigned int op_type = 2; //0: product, 1: division, 2: power
999  if (viennacl::is_division<OP>::value)
1000  op_type = 1;
1001  else if (viennacl::is_product<OP>::value)
1002  op_type = 0;
1003 
1004  element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1005  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1006  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1007  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1008 
1009  detail::cuda_arg<value_type>(proxy.lhs()),
1010  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1011  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
1012 
1013  detail::cuda_arg<value_type>(proxy.rhs()),
1014  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
1015  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
1016 
1017  op_type
1018  );
1019  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
1020  }
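 // Usage sketch (illustration only, not part of this header). The binary element_op()
 // overloads above funnel op_element_binary<OP> expressions into a single kernel selected by
 // op_type (0: product, 1: division, 2: power). Assuming the CUDA backend is active, the
 // user-facing entry points are expected to be:
 //
 //   #include "viennacl/vector.hpp"
 //
 //   void elementwise_binary_example()
 //   {
 //     viennacl::vector<float> x(1000), y(1000), z(1000);
 //     x = viennacl::linalg::element_prod(y, z);   // op_type = 0
 //     x = viennacl::linalg::element_div(y, z);    // op_type = 1
 //   }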
1021 
1023 
1024 // Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
1025 // Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
1026 
1027  // acos
1028  template <typename T> __global__ void vec_element_acos_kernel(
1029  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1030  T const * vec2, unsigned int start2, unsigned int inc2)
1031  {
1032  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1033  vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
1034  }
1035 
1036  template <typename T>
1037  void element_op(vector_base<T> & vec1,
1038  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_acos> > const & proxy)
1039  {
1040  typedef T value_type;
1041 
1042  vec_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1043  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1044  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1045  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1046  detail::cuda_arg<value_type>(proxy.lhs()),
1047  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1048  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1049  );
1050  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
1051  }
1052 
1053  // asin
1054  template <typename T> __global__ void vec_element_asin_kernel(
1055  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1056  T const * vec2, unsigned int start2, unsigned int inc2)
1057  {
1058  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1059  vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
1060  }
1061 
1062  template <typename T>
1063  void element_op(vector_base<T> & vec1,
1064  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_asin> > const & proxy)
1065  {
1066  typedef T value_type;
1067 
1068  vec_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1069  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1070  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1071  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1072  detail::cuda_arg<value_type>(proxy.lhs()),
1073  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1074  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1075  );
1076  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
1077  }
1078 
1079 
1080  // atan
1081  template <typename T> __global__ void vec_element_atan_kernel(
1082  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1083  T const * vec2, unsigned int start2, unsigned int inc2)
1084  {
1085  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1086  vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
1087  }
1088 
1089  template <typename T>
1090  void element_op(vector_base<T> & vec1,
1091  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_atan> > const & proxy)
1092  {
1093  typedef T value_type;
1094 
1095  vec_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1096  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1097  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1098  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1099  detail::cuda_arg<value_type>(proxy.lhs()),
1100  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1101  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1102  );
1103  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
1104  }
1105 
1106 
1107  // ceil
1108  template <typename T> __global__ void vec_element_ceil_kernel(
1109  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1110  T const * vec2, unsigned int start2, unsigned int inc2)
1111  {
1112  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1113  vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
1114  }
1115 
1116  template <typename T>
1117  void element_op(vector_base<T> & vec1,
1118  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_ceil> > const & proxy)
1119  {
1120  typedef T value_type;
1121 
1122  vec_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1123  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1124  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1125  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1126  detail::cuda_arg<value_type>(proxy.lhs()),
1127  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1128  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1129  );
1130  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
1131  }
1132 
1133 
1134  // cos
1135  template <typename T> __global__ void vec_element_cos_kernel(
1136  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1137  T const * vec2, unsigned int start2, unsigned int inc2)
1138  {
1139  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1140  vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
1141  }
1142 
1143  template <typename T>
1144  void element_op(vector_base<T> & vec1,
1145  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cos> > const & proxy)
1146  {
1147  typedef T value_type;
1148 
1149  vec_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1150  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1151  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1152  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1153  detail::cuda_arg<value_type>(proxy.lhs()),
1154  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1155  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1156  );
1157  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
1158  }
1159 
1160 
1161  // cosh
1162  template <typename T> __global__ void vec_element_cosh_kernel(
1163  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1164  T const * vec2, unsigned int start2, unsigned int inc2)
1165  {
1166  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1167  vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
1168  }
1169 
1170  template <typename T>
1171  void element_op(vector_base<T> & vec1,
1172  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cosh> > const & proxy)
1173  {
1174  typedef T value_type;
1175 
1176  vec_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1177  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1178  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1179  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1180  detail::cuda_arg<value_type>(proxy.lhs()),
1181  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1182  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1183  );
1184  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
1185  }
1186 
1187 
1188  // exp
1189  template <typename T> __global__ void vec_element_exp_kernel(
1190  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1191  T const * vec2, unsigned int start2, unsigned int inc2)
1192  {
1193  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1194  vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
1195  }
1196 
1197  template <typename T>
1198  void element_op(vector_base<T> & vec1,
1199  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_exp> > const & proxy)
1200  {
1201  typedef T value_type;
1202 
1203  vec_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1204  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1205  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1206  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1207  detail::cuda_arg<value_type>(proxy.lhs()),
1208  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1209  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1210  );
1211  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
1212  }
1213 
1214 
1215  // fabs
1216  template <typename T> __global__ void vec_element_fabs_kernel(
1217  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1218  T const * vec2, unsigned int start2, unsigned int inc2)
1219  {
1220  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1221  vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
1222  }
1223 
1224  template <typename T>
1225  void element_op(vector_base<T> & vec1,
1226  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_fabs> > const & proxy)
1227  {
1228  typedef T value_type;
1229 
1230  vec_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1231  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1232  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1233  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1234  detail::cuda_arg<value_type>(proxy.lhs()),
1235  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1236  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1237  );
1238  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
1239  }
1240 
1241  // abs
1242  template <typename T> __global__ void vec_element_abs_kernel(
1243  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1244  T const * vec2, unsigned int start2, unsigned int inc2)
1245  {
1246  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1247  vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
1248  }
1249 
1250  template <typename T>
1251  void element_op(vector_base<T> & vec1,
1252  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_abs> > const & proxy)
1253  {
1254  typedef T value_type;
1255 
1256  vec_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1257  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1258  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1259  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1260  detail::cuda_arg<value_type>(proxy.lhs()),
1261  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1262  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1263  );
1264  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
1265  }
1266 
1267 
1268 
1269  // floor
1270  template <typename T> __global__ void vec_element_floor_kernel(
1271  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1272  T const * vec2, unsigned int start2, unsigned int inc2)
1273  {
1274  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1275  vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
1276  }
1277 
1278  template <typename T>
1279  void element_op(vector_base<T> & vec1,
1280  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_floor> > const & proxy)
1281  {
1282  typedef T value_type;
1283 
1284  vec_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1285  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1286  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1287  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1288  detail::cuda_arg<value_type>(proxy.lhs()),
1289  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1290  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1291  );
1292  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
1293  }
1294 
1295 
1296  // log
1297  template <typename T> __global__ void vec_element_log_kernel(
1298  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1299  T const * vec2, unsigned int start2, unsigned int inc2)
1300  {
1301  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1302  vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
1303  }
1304 
1305  template <typename T>
1306  void element_op(vector_base<T> & vec1,
1307  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log> > const & proxy)
1308  {
1309  typedef T value_type;
1310 
1311  vec_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1312  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1313  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1314  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1315  detail::cuda_arg<value_type>(proxy.lhs()),
1316  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1317  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1318  );
1319  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
1320  }
1321 
1322 
1323  // log10
1324  template <typename T> __global__ void vec_element_log10_kernel(
1325  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1326  T const * vec2, unsigned int start2, unsigned int inc2)
1327  {
1328  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1329  vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
1330  }
1331 
1332  template <typename T>
1333  void element_op(vector_base<T> & vec1,
1334  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log10> > const & proxy)
1335  {
1336  typedef T value_type;
1337 
1338  vec_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1339  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1340  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1341  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1342  detail::cuda_arg<value_type>(proxy.lhs()),
1343  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1344  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1345  );
1346  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
1347  }
1348 
1349 
1350  // sin
1351  template <typename T> __global__ void vec_element_sin_kernel(
1352  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1353  T const * vec2, unsigned int start2, unsigned int inc2)
1354  {
1355  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1356  vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
1357  }
1358 
1359  template <typename T>
1360  void element_op(vector_base<T> & vec1,
1361  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sin> > const & proxy)
1362  {
1363  typedef T value_type;
1364 
1365  vec_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1366  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1367  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1368  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1369  detail::cuda_arg<value_type>(proxy.lhs()),
1370  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1371  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1372  );
1373  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
1374  }
1375 
1376 
1377  // sinh
1378  template <typename T> __global__ void vec_element_sinh_kernel(
1379  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1380  T const * vec2, unsigned int start2, unsigned int inc2)
1381  {
1382  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1383  vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
1384  }
1385 
1386  template <typename T>
1387  void element_op(vector_base<T> & vec1,
1388  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sinh> > const & proxy)
1389  {
1390  typedef T value_type;
1391 
1392  vec_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1393  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1394  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1395  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1396  detail::cuda_arg<value_type>(proxy.lhs()),
1397  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1398  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1399  );
1400  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
1401  }
1402 
1403 
1404  // sqrt
1405  template <typename T> __global__ void vec_element_sqrt_kernel(
1406  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1407  T const * vec2, unsigned int start2, unsigned int inc2)
1408  {
1409  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1410  vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
1411  }
1412 
1413  template <typename T>
1414  void element_op(vector_base<T> & vec1,
1415  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sqrt> > const & proxy)
1416  {
1417  typedef T value_type;
1418 
1419  vec_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1420  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1421  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1422  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1423  detail::cuda_arg<value_type>(proxy.lhs()),
1424  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1425  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1426  );
1427  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
1428  }
1429 
1430 
1431  // tan
1432  template <typename T> __global__ void vec_element_tan_kernel(
1433  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1434  T const * vec2, unsigned int start2, unsigned int inc2)
1435  {
1436  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1437  vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
1438  }
1439 
1440  template <typename T>
1441  void element_op(vector_base<T> & vec1,
1442  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tan> > const & proxy)
1443  {
1444  typedef T value_type;
1445 
1446  vec_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1447  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1448  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1449  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1450  detail::cuda_arg<value_type>(proxy.lhs()),
1451  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1452  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1453  );
1454  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
1455  }
1456 
1457 
1458  // tanh
1459  template <typename T> __global__ void vec_element_tanh_kernel(
1460  T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1461  T const * vec2, unsigned int start2, unsigned int inc2)
1462  {
1463  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1464  vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
1465  }
1466 
1467  template <typename T>
1468  void element_op(vector_base<T> & vec1,
1469  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tanh> > const & proxy)
1470  {
1471  typedef T value_type;
1472 
1473  vec_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1474  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1475  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1476  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1477  detail::cuda_arg<value_type>(proxy.lhs()),
1478  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1479  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1480  );
1481  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
1482  }
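 // Usage sketch (illustration only, not part of this header). Each hand-rolled pair above
 // (vec_element_*_kernel plus its element_op() overload) backs one element-wise math function
 // of the vector API. Assuming the CUDA backend is active:
 //
 //   #include "viennacl/vector.hpp"
 //
 //   void elementwise_unary_example()
 //   {
 //     viennacl::vector<double> x(1000), y(1000);
 //     x = viennacl::linalg::element_exp(y);    // vec_element_exp_kernel
 //     x = viennacl::linalg::element_sqrt(y);   // vec_element_sqrt_kernel
 //   }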
1483 
1484 
1485 
1487 
1488 
1489  template <typename T>
1490  __global__ void inner_prod_kernel(const T * vec1,
1491  unsigned int start1,
1492  unsigned int inc1,
1493  unsigned int size1,
1494  const T * vec2,
1495  unsigned int start2,
1496  unsigned int inc2,
1497  unsigned int size2,
1498  T * group_buffer)
1499  {
1500  __shared__ T tmp_buffer[128];
1501  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
1502  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
1503 
1504  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
1505  - ( blockIdx.x * size1) / (gridDim.x);
1506 
1507 
1508  T tmp = 0;
1509  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
1510  tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
1511  tmp_buffer[threadIdx.x] = tmp;
1512 
1513  // parallel reduction
1514  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1515  {
1516  __syncthreads();
1517  if (threadIdx.x < stride)
1518  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
1519  }
1520 
1521  if (threadIdx.x == 0)
1522  group_buffer[blockIdx.x] = tmp_buffer[0];
1523 
1524  }
1525 
1526 
1527 
1528  // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
1529  template <typename T>
1530  __global__ void vector_sum_kernel_floats(
1531  const T * vec1,
1532  unsigned int start1,
1533  unsigned int inc1,
1534  unsigned int size1,
1535  unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
1536  T * result)
1537  {
1538  __shared__ T tmp_buffer[128];
1539  T thread_sum = 0;
1540  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1541  {
1542  if (option > 0)
1543  thread_sum += vec1[i*inc1+start1];
1544  else
1545  thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
1546  }
1547 
1548  tmp_buffer[threadIdx.x] = thread_sum;
1549 
1550  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1551  {
1552  __syncthreads();
1553  if (threadIdx.x < stride)
1554  {
1555  if (option > 0)
1556  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1557  else
1558  tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
1559  }
1560  }
1561 
1562  if (threadIdx.x == 0)
1563  {
1564  if (option == 2)
1565  *result = sqrt(tmp_buffer[0]);
1566  else
1567  *result = tmp_buffer[0];
1568  }
1569  }
1570 
1571  template <typename T>
1572  __global__ void vector_sum_kernel_integers(
1573  const T * vec1,
1574  unsigned int start1,
1575  unsigned int inc1,
1576  unsigned int size1,
1577  unsigned int option, //0: use max, 1: just sum
1578  T * result)
1579  {
1580  __shared__ T tmp_buffer[128];
1581  T thread_sum = 0;
1582  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1583  {
1584  if (option > 0)
1585  thread_sum += vec1[i*inc1+start1];
1586  else
1587  thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
1588  }
1589 
1590  tmp_buffer[threadIdx.x] = thread_sum;
1591 
1592  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1593  {
1594  __syncthreads();
1595  if (threadIdx.x < stride)
1596  {
1597  if (option > 0)
1598  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1599  else
1600  tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
1601  }
1602  }
1603 
1604  if (threadIdx.x == 0)
1605  *result = tmp_buffer[0];
1606  }
1607 
1608  template <typename T>
1609  __global__ void vector_sum_kernel_unsigned_integers(
1610  const T * vec1,
1611  unsigned int start1,
1612  unsigned int inc1,
1613  unsigned int size1,
1614  unsigned int option, //0: use max, 1: just sum
1615  T * result)
1616  {
1617  __shared__ T tmp_buffer[128];
1618  T thread_sum = 0;
1619  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1620  {
1621  if (option > 0)
1622  thread_sum += vec1[i*inc1+start1];
1623  else
1624  thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
1625  }
1626 
1627  tmp_buffer[threadIdx.x] = thread_sum;
1628 
1629  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1630  {
1631  __syncthreads();
1632  if (threadIdx.x < stride)
1633  {
1634  if (option > 0)
1635  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1636  else
1637  tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
1638  }
1639  }
1640 
1641  if (threadIdx.x == 0)
1642  *result = tmp_buffer[0];
1643  }
1644 
1645  namespace detail
1646  {
1648  struct vector_sum_kernel_launcher_integers
1649  {
1650  template <typename T, typename S3>
1651  static void apply(vector_base<T> const & temp,
1652  unsigned int option,
1653  S3 & result)
1654  {
1655  typedef T value_type;
1656  vector_sum_kernel_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
1657  static_cast<unsigned int>(viennacl::traits::start(temp)),
1658  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1659  static_cast<unsigned int>(viennacl::traits::size(temp)),
1660  static_cast<unsigned int>(option),
1661  detail::cuda_arg<value_type>(result) );
1662  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1663  }
1664  };
1665 
1666  struct vector_sum_kernel_launcher_unsigned_integers
1667  {
1668  template <typename T, typename S3>
1669  static void apply(vector_base<T> const & temp,
1670  unsigned int option,
1671  S3 & result)
1672  {
1673  typedef T value_type;
1674  vector_sum_kernel_unsigned_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
1675  static_cast<unsigned int>(viennacl::traits::start(temp)),
1676  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1677  static_cast<unsigned int>(viennacl::traits::size(temp)),
1678  static_cast<unsigned int>(option),
1679  detail::cuda_arg<value_type>(result) );
1680  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1681  }
1682  };
1683 
1684  struct vector_sum_kernel_launcher_floats
1685  {
1686  template <typename T, typename S3>
1687  static void apply(vector_base<T> const & temp,
1688  unsigned int option,
1689  S3 & result)
1690  {
1691  typedef T value_type;
1692  vector_sum_kernel_floats<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
1693  static_cast<unsigned int>(viennacl::traits::start(temp)),
1694  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1695  static_cast<unsigned int>(viennacl::traits::size(temp)),
1696  static_cast<unsigned int>(option),
1697  detail::cuda_arg<value_type>(result) );
1698  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1699  }
1700  };
1701 
1702  template <typename T>
1703  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
1704 
1705  template <>
1706  struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {};
1707 
1708  template <>
1709  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};
1710 
1711  template <>
1712  struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {};
1713 
1714  template <>
1715  struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {};
1716 
1717  template <>
1718  struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {};
1719 
1720  template <>
1721  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
1722 
1724  }
1725 
1726 
1727  //implementation of inner product:
1728  //namespace {
1735  template <typename T, typename S3>
1736  void inner_prod_impl(vector_base<T> const & vec1,
1737  vector_base<T> const & vec2,
1738  S3 & result)
1739  {
1740  typedef T value_type;
1741 
1742  static const unsigned int work_groups = 128;
1743  static viennacl::vector<value_type> temp(work_groups);
1744 
1745  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1746  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1747  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1748  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1749  detail::cuda_arg<value_type>(vec2),
1750  static_cast<unsigned int>(viennacl::traits::start(vec2)),
1751  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
1752  static_cast<unsigned int>(viennacl::traits::size(vec2)),
1753  detail::cuda_arg<value_type>(temp)
1754  );
1755  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
1756 
1757  detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
1758  }
1759 
1760 
1767  template <typename T>
1768  void inner_prod_cpu(vector_base<T> const & vec1,
1769  vector_base<T> const & vec2,
1770  T & result)
1771  {
1772  typedef T value_type;
1773 
1774  const unsigned int work_groups = 128;
1775  viennacl::vector<value_type> temp(work_groups);
1776 
1777  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1778  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1779  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1780  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1781  detail::cuda_arg<value_type>(vec2),
1782  static_cast<unsigned int>(viennacl::traits::start(vec2)),
1783  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
1784  static_cast<unsigned int>(viennacl::traits::size(vec2)),
1785  detail::cuda_arg<value_type>(temp)
1786  );
1787  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
1788 
1789  // Now copy partial results from GPU back to CPU and run reduction there:
1790  std::vector<value_type> temp_cpu(work_groups);
1791  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
1792 
1793  result = 0;
1794  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
1795  result += *it;
1796  }
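 // Usage sketch (illustration only, not part of this header). Both inner product variants use
 // the same two-stage reduction: inner_prod_kernel writes one partial sum per block, and the
 // 128 partial results are then summed either on the device (inner_prod_impl, result stays in
 // a viennacl::scalar) or on the host (inner_prod_cpu). Assuming the CUDA backend is active:
 //
 //   #include "viennacl/vector.hpp"
 //   #include "viennacl/linalg/inner_prod.hpp"
 //
 //   void inner_prod_example()
 //   {
 //     viennacl::vector<float> x(1000), y(1000);
 //     viennacl::scalar<float> s_dev  = viennacl::linalg::inner_prod(x, y);  // device-side final sum
 //     float                   s_host = viennacl::linalg::inner_prod(x, y);  // host-side final sum
 //   }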
1797 
1799 
1800 #define VIENNACL_MDOT_WORKGROUP_SIZE 128
1801 #define VIENNACL_MDOT_WORKGROUP_NUM 128
1802  // M = 2:
1803  template <typename NumericT>
1804  __global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1805  const NumericT *y0, unsigned int start0, unsigned int stride0,
1806  const NumericT *y1, unsigned int start1, unsigned int stride1,
1807  NumericT *group_results)
1808  {
1809  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
1810  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1811  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1812  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
1813 
1814  NumericT entry_x = 0;
1815  NumericT group_sum0 = 0;
1816  NumericT group_sum1 = 0;
1817  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1818  entry_x = x[i * stridex + startx]; // load only once from global memory!
1819  group_sum0 += entry_x * y0[i * stride0 + start0];
1820  group_sum1 += entry_x * y1[i * stride1 + start1];
1821  }
1822  tmp_buffer[threadIdx.x] = group_sum0;
1823  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1824 
1825  // parallel reduction
1826  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1827  __syncthreads();
1828  if (threadIdx.x < stride) {
1829  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1830  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1831  }
1832  }
1833 
1834  // write result of group to group_results
1835  if (threadIdx.x == 0) {
1836  group_results[blockIdx.x] = tmp_buffer[0];
1837  group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
1838  }
1839  }
1840 
1841  // M = 3:
1842  template <typename NumericT>
1843  __global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1844  const NumericT *y0, unsigned int start0, unsigned int stride0,
1845  const NumericT *y1, unsigned int start1, unsigned int stride1,
1846  const NumericT *y2, unsigned int start2, unsigned int stride2,
1847  NumericT *group_results)
1848  {
1849  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
1850  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1851  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1852  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1853 
1854  NumericT entry_x = 0;
1855  NumericT group_sum0 = 0;
1856  NumericT group_sum1 = 0;
1857  NumericT group_sum2 = 0;
1858  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1859  entry_x = x[i * stridex + startx]; // load only once from global memory!
1860  group_sum0 += entry_x * y0[i * stride0 + start0];
1861  group_sum1 += entry_x * y1[i * stride1 + start1];
1862  group_sum2 += entry_x * y2[i * stride2 + start2];
1863  }
1864  tmp_buffer[threadIdx.x] = group_sum0;
1865  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1866  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1867 
1868  // parallel reduction
1869  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1870  __syncthreads();
1871  if (threadIdx.x < stride) {
1872  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1873  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1874  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1875  }
1876  }
1877 
1878  // write result of group to group_results
1879  if (threadIdx.x == 0) {
1880  group_results[blockIdx.x ] = tmp_buffer[0];
1881  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
1882  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
1883  }
1884  }
1885 
1886  // M = 4:
1887  template <typename NumericT>
1888  __global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1889  const NumericT *y0, unsigned int start0, unsigned int stride0,
1890  const NumericT *y1, unsigned int start1, unsigned int stride1,
1891  const NumericT *y2, unsigned int start2, unsigned int stride2,
1892  const NumericT *y3, unsigned int start3, unsigned int stride3,
1893  NumericT *group_results)
1894  {
1895  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
1896  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1897  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1898  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1899 
1900  NumericT entry_x = 0;
1901  NumericT group_sum0 = 0;
1902  NumericT group_sum1 = 0;
1903  NumericT group_sum2 = 0;
1904  NumericT group_sum3 = 0;
1905  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1906  entry_x = x[i * stridex + startx]; // load only once from global memory!
1907  group_sum0 += entry_x * y0[i * stride0 + start0];
1908  group_sum1 += entry_x * y1[i * stride1 + start1];
1909  group_sum2 += entry_x * y2[i * stride2 + start2];
1910  group_sum3 += entry_x * y3[i * stride3 + start3];
1911  }
1912  tmp_buffer[threadIdx.x] = group_sum0;
1913  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1914  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1915  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
1916 
1917  // parallel reduction
1918  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1919  __syncthreads();
1920  if (threadIdx.x < stride) {
1921  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1922  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1923  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1924  tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
1925  }
1926  }
1927 
1928  // write result of group to group_results
1929  if (threadIdx.x == 0) {
1930  group_results[blockIdx.x ] = tmp_buffer[0];
1931  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
1932  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
1933  group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
1934  }
1935  }
1936 
1937  // M = 8:
1938  template <typename NumericT>
1939  __global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1940  const NumericT *y0, unsigned int start0, unsigned int stride0,
1941  const NumericT *y1, unsigned int start1, unsigned int stride1,
1942  const NumericT *y2, unsigned int start2, unsigned int stride2,
1943  const NumericT *y3, unsigned int start3, unsigned int stride3,
1944  const NumericT *y4, unsigned int start4, unsigned int stride4,
1945  const NumericT *y5, unsigned int start5, unsigned int stride5,
1946  const NumericT *y6, unsigned int start6, unsigned int stride6,
1947  const NumericT *y7, unsigned int start7, unsigned int stride7,
1948  NumericT *group_results)
1949  {
1950  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
1951  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1952  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1953  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1954 
1955  NumericT entry_x = 0;
1956  NumericT group_sum0 = 0;
1957  NumericT group_sum1 = 0;
1958  NumericT group_sum2 = 0;
1959  NumericT group_sum3 = 0;
1960  NumericT group_sum4 = 0;
1961  NumericT group_sum5 = 0;
1962  NumericT group_sum6 = 0;
1963  NumericT group_sum7 = 0;
1964  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1965  entry_x = x[i * stridex + startx]; // load only once from global memory!
1966  group_sum0 += entry_x * y0[i * stride0 + start0];
1967  group_sum1 += entry_x * y1[i * stride1 + start1];
1968  group_sum2 += entry_x * y2[i * stride2 + start2];
1969  group_sum3 += entry_x * y3[i * stride3 + start3];
1970  group_sum4 += entry_x * y4[i * stride4 + start4];
1971  group_sum5 += entry_x * y5[i * stride5 + start5];
1972  group_sum6 += entry_x * y6[i * stride6 + start6];
1973  group_sum7 += entry_x * y7[i * stride7 + start7];
1974  }
1975  tmp_buffer[threadIdx.x] = group_sum0;
1976  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1977  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1978  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
1979  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
1980  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
1981  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
1982  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
1983 
1984  // parallel reduction
1985  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1986  __syncthreads();
1987  if (threadIdx.x < stride) {
1988  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1989  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1990  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1991  tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
1992  tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
1993  tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
1994  tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
1995  tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
1996  }
1997  }
1998 
1999  // write result of group to group_results
2000  if (threadIdx.x == 0) {
2001  group_results[blockIdx.x ] = tmp_buffer[0];
2002  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
2003  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
2004  group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
2005  group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
2006  group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
2007  group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
2008  group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
2009  }
2010  }
2011 
2012  // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
2013  template <typename T>
2014  __global__ void vector_multi_sum_kernel(
2015  T const * vec1,
2016  T * result,
2017  unsigned int start_result,
2018  unsigned int inc_result)
2019  {
2020  __shared__ T tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
2021 
2022  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
2023 
2024  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2025  {
2026  __syncthreads();
2027  if (threadIdx.x < stride)
2028  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
2029  }
2030 
2031  if (threadIdx.x == 0)
2032  result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
2033  }
2034 
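  // vector_multi_sum_kernel is launched with M blocks of VIENNACL_MDOT_WORKGROUP_NUM
  // threads (M = number of dot products handled in one pass). Block k reduces the k-th
  // contiguous slice of 128 partial sums in 'vec1' and writes the k-th result to
  // result[start_result + k * inc_result]. This relies on VIENNACL_MDOT_WORKGROUP_SIZE
  // and VIENNACL_MDOT_WORKGROUP_NUM both being 128. The host function below walks the
  // vector tuple in chunks of 8, 4, 3, 2 or 1 right-hand sides per kernel launch.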
2035  template <typename T>
2036  void inner_prod_impl(vector_base<T> const & x,
2037  vector_tuple<T> const & vec_tuple,
2038  vector_base<T> & result)
2039  {
2040  typedef T value_type;
2041  typedef T value_type;
2042  static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
2043 
2044  vcl_size_t current_index = 0;
2045  while (vec_tuple.const_size() > current_index)
2046  {
2047  switch (vec_tuple.const_size() - current_index)
2048  {
2049  case 7:
2050  case 6:
2051  case 5:
2052  case 4:
2053  {
2054  vector_base<T> const & y0 = vec_tuple.const_at(current_index);
2055  vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
2056  vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
2057  vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
2058 
2059  inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2060  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2061  static_cast<unsigned int>(viennacl::traits::start(x)),
2062  static_cast<unsigned int>(viennacl::traits::stride(x)),
2063  static_cast<unsigned int>(viennacl::traits::size(x)),
2064  detail::cuda_arg<value_type>(y0),
2065  static_cast<unsigned int>(viennacl::traits::start(y0)),
2066  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2067  detail::cuda_arg<value_type>(y1),
2068  static_cast<unsigned int>(viennacl::traits::start(y1)),
2069  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2070  detail::cuda_arg<value_type>(y2),
2071  static_cast<unsigned int>(viennacl::traits::start(y2)),
2072  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2073  detail::cuda_arg<value_type>(y3),
2074  static_cast<unsigned int>(viennacl::traits::start(y3)),
2075  static_cast<unsigned int>(viennacl::traits::stride(y3)),
2076  detail::cuda_arg<value_type>(temp)
2077  );
2078  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
2079  vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2080  detail::cuda_arg<value_type>(result),
2081  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2082  static_cast<unsigned int>(viennacl::traits::stride(result))
2083  );
2084  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2085  }
2086  current_index += 4;
2087  break;
2088  case 3:
2089  {
2090  vector_base<T> const & y0 = vec_tuple.const_at(current_index);
2091  vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
2092  vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
2093 
2094  inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2095  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2096  static_cast<unsigned int>(viennacl::traits::start(x)),
2097  static_cast<unsigned int>(viennacl::traits::stride(x)),
2098  static_cast<unsigned int>(viennacl::traits::size(x)),
2099  detail::cuda_arg<value_type>(y0),
2100  static_cast<unsigned int>(viennacl::traits::start(y0)),
2101  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2102  detail::cuda_arg<value_type>(y1),
2103  static_cast<unsigned int>(viennacl::traits::start(y1)),
2104  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2105  detail::cuda_arg<value_type>(y2),
2106  static_cast<unsigned int>(viennacl::traits::start(y2)),
2107  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2108  detail::cuda_arg<value_type>(temp)
2109  );
2110  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
2111  vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2112  detail::cuda_arg<value_type>(result),
2113  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2114  static_cast<unsigned int>(viennacl::traits::stride(result))
2115  );
2116  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2117  }
2118  current_index += 3;
2119  break;
2120  case 2:
2121  {
2122  vector_base<T> const & y0 = vec_tuple.const_at(current_index);
2123  vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
2124 
2125  inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2126  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2127  static_cast<unsigned int>(viennacl::traits::start(x)),
2128  static_cast<unsigned int>(viennacl::traits::stride(x)),
2129  static_cast<unsigned int>(viennacl::traits::size(x)),
2130  detail::cuda_arg<value_type>(y0),
2131  static_cast<unsigned int>(viennacl::traits::start(y0)),
2132  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2133  detail::cuda_arg<value_type>(y1),
2134  static_cast<unsigned int>(viennacl::traits::start(y1)),
2135  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2136  detail::cuda_arg<value_type>(temp)
2137  );
2138  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
2139  vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2140  detail::cuda_arg<value_type>(result),
2141  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2142  static_cast<unsigned int>(viennacl::traits::stride(result))
2143  );
2144  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2145  }
2146  current_index += 2;
2147  break;
2148  case 1:
2149  {
2150  vector_base<T> const & y0 = vec_tuple.const_at(current_index);
2151  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(x),
2152  static_cast<unsigned int>(viennacl::traits::start(x)),
2153  static_cast<unsigned int>(viennacl::traits::stride(x)),
2154  static_cast<unsigned int>(viennacl::traits::size(x)),
2155  detail::cuda_arg<value_type>(y0),
2156  static_cast<unsigned int>(viennacl::traits::start(y0)),
2157  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2158  static_cast<unsigned int>(viennacl::traits::size(y0)),
2159  detail::cuda_arg<value_type>(temp)
2160  );
2161  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
2162 
2163  vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
2164  detail::cuda_arg<value_type>(result),
2165  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2166  static_cast<unsigned int>(viennacl::traits::stride(result))
2167  );
2168  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2169  }
2170  current_index += 1;
2171  break;
2172 
2173  default:
2174  {
2175  vector_base<T> const & y0 = vec_tuple.const_at(current_index);
2176  vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
2177  vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
2178  vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
2179  vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
2180  vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
2181  vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
2182  vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
2183 
2184  inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2185  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2186  static_cast<unsigned int>(viennacl::traits::start(x)),
2187  static_cast<unsigned int>(viennacl::traits::stride(x)),
2188  static_cast<unsigned int>(viennacl::traits::size(x)),
2189  detail::cuda_arg<value_type>(y0),
2190  static_cast<unsigned int>(viennacl::traits::start(y0)),
2191  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2192  detail::cuda_arg<value_type>(y1),
2193  static_cast<unsigned int>(viennacl::traits::start(y1)),
2194  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2195  detail::cuda_arg<value_type>(y2),
2196  static_cast<unsigned int>(viennacl::traits::start(y2)),
2197  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2198  detail::cuda_arg<value_type>(y3),
2199  static_cast<unsigned int>(viennacl::traits::start(y3)),
2200  static_cast<unsigned int>(viennacl::traits::stride(y3)),
2201  detail::cuda_arg<value_type>(y4),
2202  static_cast<unsigned int>(viennacl::traits::start(y4)),
2203  static_cast<unsigned int>(viennacl::traits::stride(y4)),
2204  detail::cuda_arg<value_type>(y5),
2205  static_cast<unsigned int>(viennacl::traits::start(y5)),
2206  static_cast<unsigned int>(viennacl::traits::stride(y5)),
2207  detail::cuda_arg<value_type>(y6),
2208  static_cast<unsigned int>(viennacl::traits::start(y6)),
2209  static_cast<unsigned int>(viennacl::traits::stride(y6)),
2210  detail::cuda_arg<value_type>(y7),
2211  static_cast<unsigned int>(viennacl::traits::start(y7)),
2212  static_cast<unsigned int>(viennacl::traits::stride(y7)),
2213  detail::cuda_arg<value_type>(temp)
2214  );
2215  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
2216  vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2217  detail::cuda_arg<value_type>(result),
2218  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2219  static_cast<unsigned int>(viennacl::traits::stride(result))
2220  );
2221  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2222  }
2223  current_index += 8;
2224  break;
2225  }
2226  }
2227  }
2228 
2229 #undef VIENNACL_MDOT_WORKGROUP_NUM
2230 #undef VIENNACL_MDOT_WORKGROUP_SIZE
2231 
2233 
2234  template <typename T>
2235  __global__ void norm_kernel_floats(
2236  const T * vec,
2237  unsigned int start1,
2238  unsigned int inc1,
2239  unsigned int size1,
2240  unsigned int norm_selector,
2241  T * group_buffer)
2242  {
2243  __shared__ T tmp_buffer[128];
2244 
2245  T tmp = 0;
2246  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2247  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2248  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2249  group_stop = (group_stop > size1) ? size1 : group_stop;
2250 
2251  if (norm_selector == 1) //norm_1
2252  {
2253  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2254  tmp += fabs(vec[i*inc1 + start1]);
2255  }
2256  else if (norm_selector == 2) //norm_2
2257  {
2258  T vec_entry = 0;
2259  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2260  {
2261  vec_entry = vec[i*inc1 + start1];
2262  tmp += vec_entry * vec_entry;
2263  }
2264  }
2265  else if (norm_selector == 0) //norm_inf
2266  {
2267  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2268  tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
2269  }
2270 
2271  tmp_buffer[threadIdx.x] = tmp;
2272 
2273  if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
2274  {
2275  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2276  {
2277  __syncthreads();
2278  if (threadIdx.x < stride)
2279  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2280  }
2281  }
2282  else
2283  {
2284  //norm_inf:
2285  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2286  {
2287  __syncthreads();
2288  if (threadIdx.x < stride)
2289  tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
2290  }
2291  }
2292 
2293  if (threadIdx.x == 0)
2294  group_buffer[blockIdx.x] = tmp_buffer[0];
2295  }
2296 
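  // First reduction stage for the norms: norm_selector picks the accumulation
  // (0: infinity-norm via fmax, 1: 1-norm via fabs, 2: squared 2-norm), and each of the
  // 128 blocks writes a single partial value to group_buffer. The integer variants below
  // follow the same structure (1-norm and infinity-norm only). The final reduction over
  // the 128 partial values is done by vector_sum_kernel_launcher with the same 'option'
  // value (see norm_1_impl / norm_2_impl / norm_inf_impl further down).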
2297  template <typename T>
2298  __global__ void norm_kernel_integers(
2299  const T * vec,
2300  unsigned int start1,
2301  unsigned int inc1,
2302  unsigned int size1,
2303  unsigned int norm_selector,
2304  T * group_buffer)
2305  {
2306  __shared__ T tmp_buffer[128];
2307 
2308  T tmp = 0;
2309  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2310  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2311  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2312  group_stop = (group_stop > size1) ? size1 : group_stop;
2313 
2314  if (norm_selector == 1) //norm_1
2315  {
2316  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2317  tmp += abs(vec[i*inc1 + start1]);
2318  }
2319  else if (norm_selector == 0) //norm_inf
2320  {
2321  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2322  tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
2323  }
2324 
2325  tmp_buffer[threadIdx.x] = tmp;
2326 
2327  if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
2328  {
2329  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2330  {
2331  __syncthreads();
2332  if (threadIdx.x < stride)
2333  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2334  }
2335  }
2336  else
2337  {
2338  //norm_inf:
2339  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2340  {
2341  __syncthreads();
2342  if (threadIdx.x < stride)
2343  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
2344  }
2345  }
2346 
2347  if (threadIdx.x == 0)
2348  group_buffer[blockIdx.x] = tmp_buffer[0];
2349  }
2350 
2351  template <typename T>
2352  __global__ void norm_kernel_unsigned_integers(
2353  const T * vec,
2354  unsigned int start1,
2355  unsigned int inc1,
2356  unsigned int size1,
2357  unsigned int norm_selector,
2358  T * group_buffer)
2359  {
2360  __shared__ T tmp_buffer[128];
2361 
2362  T tmp = 0;
2363  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2364  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2365  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2366  group_stop = (group_stop > size1) ? size1 : group_stop;
2367 
2368  if (norm_selector == 1) //norm_1
2369  {
2370  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2371  tmp += vec[i*inc1 + start1];
2372  }
2373  else if (norm_selector == 0) //norm_inf
2374  {
2375  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2376  tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
2377  }
2378 
2379  tmp_buffer[threadIdx.x] = tmp;
2380 
2381  if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
2382  {
2383  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2384  {
2385  __syncthreads();
2386  if (threadIdx.x < stride)
2387  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2388  }
2389  }
2390  else
2391  {
2392  //norm_inf:
2393  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2394  {
2395  __syncthreads();
2396  if (threadIdx.x < stride)
2397  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
2398  }
2399  }
2400 
2401  if (threadIdx.x == 0)
2402  group_buffer[blockIdx.x] = tmp_buffer[0];
2403  }
2404 
2406  namespace detail
2407  {
2408  struct norm_kernel_launcher_integers
2409  {
2410  template <typename T>
2411  static void apply(vector_base<T> const & vec1,
2412  vector_base<T> & temp,
2413  unsigned int option)
2414  {
2415  typedef T value_type;
2416  norm_kernel_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2417  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2418  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2419  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2420  static_cast<unsigned int>(option),
2421  detail::cuda_arg<value_type>(temp)
2422  );
2423  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2424  }
2425  };
2426 
2427  struct norm_kernel_launcher_unsigned_integers
2428  {
2429  template <typename T>
2430  static void apply(vector_base<T> const & vec1,
2431  vector_base<T> & temp,
2432  unsigned int option)
2433  {
2434  typedef T value_type;
2435  norm_kernel_unsigned_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2436  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2437  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2438  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2439  static_cast<unsigned int>(option),
2440  detail::cuda_arg<value_type>(temp)
2441  );
2442  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2443  }
2444  };
2445 
2446 
2447  struct norm_kernel_launcher_floats
2448  {
2449  template <typename T>
2450  static void apply(vector_base<T> const & vec1,
2451  vector_base<T> & temp,
2452  unsigned int option)
2453  {
2454  typedef T value_type;
2455  norm_kernel_floats<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2456  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2457  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2458  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2459  static_cast<unsigned int>(option),
2460  detail::cuda_arg<value_type>(temp)
2461  );
2462  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2463  }
2464  };
2465 
2466  template <typename T>
2467  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};
2468 
2469  template <>
2470  struct norm_kernel_launcher<unsigned char> : public norm_kernel_launcher_unsigned_integers {};
2471 
2472  template <>
2473  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};
2474 
2475  template <>
2476  struct norm_kernel_launcher<unsigned int> : public norm_kernel_launcher_unsigned_integers {};
2477 
2478  template <>
2479  struct norm_kernel_launcher<unsigned long> : public norm_kernel_launcher_unsigned_integers {};
2480 
2481  template <>
2482  struct norm_kernel_launcher<float> : public norm_kernel_launcher_floats {};
2483 
2484  template <>
2485  struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
2486 
2487  }
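  // The norm_kernel_launcher<T> dispatch above selects the kernel matching the entry
  // type: the default handles signed integers via abs(), the unsigned specializations
  // skip the absolute value entirely, and float/double use the fabs()/fmax()-based
  // kernel. All of them launch a 128 x 128 grid/block configuration and fill a
  // 128-entry temporary vector with per-block partial results.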
2496  template <typename T>
2497  void norm_1_impl(vector_base<T> const & vec1,
2498  scalar<T> & result)
2499  {
2500  typedef T value_type;
2501 
2502  vcl_size_t work_groups = 128;
2503  viennacl::vector<value_type> temp(work_groups);
2504 
2505  detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
2506  detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
2507  }
2508 
2514  template <typename T>
2515  void norm_1_cpu(vector_base<T> const & vec1,
2516  T & result)
2517  {
2518  typedef T value_type;
2519 
2520  vcl_size_t work_groups = 128;
2521  viennacl::vector<value_type> temp(work_groups);
2522 
2523  detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
2524 
2525  // Now copy partial results from GPU back to CPU and run reduction there:
2526  std::vector<value_type> temp_cpu(work_groups);
2527  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2528 
2529  result = 0;
2530  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2531  result += *it;
2532  }
2533 
2535 
2541  template <typename T>
2542  void norm_2_impl(vector_base<T> const & vec1,
2543  scalar<T> & result)
2544  {
2545  typedef T value_type;
2546 
2547  vcl_size_t work_groups = 128;
2548  viennacl::vector<value_type> temp(work_groups);
2549 
2550  detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
2551 
2552  detail::vector_sum_kernel_launcher<T>::apply(temp, 2, result);
2553  }
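  // Note: for norm_2_impl the square root is evidently left to the second-stage
  // reduction (option 2 passed to vector_sum_kernel_launcher), whereas norm_2_cpu
  // below sums the partial squared values on the host and applies std::sqrt there.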
2554 
2560  template <typename T>
2561  void norm_2_cpu(vector_base<T> const & vec1,
2562  T & result)
2563  {
2564  typedef T value_type;
2565 
2566  vcl_size_t work_groups = 128;
2567  viennacl::vector<value_type> temp(work_groups);
2568 
2569  detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
2570 
2571  std::vector<value_type> temp_cpu(work_groups);
2572  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2573 
2574  result = 0;
2575  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2576  result += *it;
2577  result = std::sqrt(result);
2578  }
2579 
2580 
2582 
2588  template <typename T>
2589  void norm_inf_impl(vector_base<T> const & vec1,
2590  scalar<T> & result)
2591  {
2592  typedef T value_type;
2593 
2594  vcl_size_t work_groups = 128;
2595  viennacl::vector<value_type> temp(work_groups);
2596 
2597  detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
2598  detail::vector_sum_kernel_launcher<T>::apply(temp, 0, result);
2599  }
2600 
2601 
2602 
2608  template <typename T>
2609  void norm_inf_cpu(vector_base<T> const & vec1,
2610  T & result)
2611  {
2612  typedef T value_type;
2613 
2614  vcl_size_t work_groups = 128;
2615  viennacl::vector<value_type> temp(work_groups);
2616 
2617  detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
2618 
2619  std::vector<value_type> temp_cpu(work_groups);
2620  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2621 
2622  result = 0;
2623  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2624  result = std::max(result, *it);
2625  }
2626 
2627 
2629 
2630 
2631 
2632  //index_norm_inf:
2633 
2634  // fixes the problem of not having (f)abs available in a consistent manner
2635  template <typename T>
2636  __device__ T cuda_abs(T val) { return (val < 0) ? -val : val; }
2637  __device__ inline unsigned long cuda_abs(unsigned long val) { return val; }
2638  __device__ inline unsigned int cuda_abs(unsigned int val) { return val; }
2639  __device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
2640  __device__ inline unsigned char cuda_abs(unsigned char val) { return val; }
2641 
2642  template <typename T>
2643  __global__ void index_norm_inf_kernel(const T * vec,
2644  unsigned int start1,
2645  unsigned int inc1,
2646  unsigned int size1,
2647  unsigned int * result)
2648  {
2649  __shared__ T float_buffer[128];
2650  __shared__ unsigned int index_buffer[128];
2651 
2652  float_buffer[threadIdx.x] = 0;
2653  index_buffer[threadIdx.x] = 0;
2654 
2655  //step 1: fill buffer:
2656  T cur_max = (T)0;
2657  T tmp;
2658  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
2659  {
2660  tmp = vec[i*inc1+start1];
2661  tmp = cuda_abs(tmp);
2662  if (cur_max < tmp)
2663  {
2664  float_buffer[threadIdx.x] = tmp;
2665  index_buffer[threadIdx.x] = i;
2666  cur_max = tmp;
2667  }
2668  }
2669 
2670  //step 2: parallel reduction:
2671  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2672  {
2673  __syncthreads();
2674  if (threadIdx.x < stride)
2675  {
2676  //find the first occurring index
2677  if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
2678  {
2679  index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
2680  float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
2681  }
2682  }
2683  }
2684 
2685  if (threadIdx.x == 0)
2686  *result = index_buffer[0];
2687  }
2688 
2689  //This function should return a CPU scalar, otherwise statements like
2690  // vcl_rhs[index_norm_inf(vcl_rhs)]
2691  // are ambiguous
2697  template <typename T>
2698  vcl_size_t index_norm_inf(vector_base<T> const & vec1)
2699  {
2700  typedef T value_type;
2701 
2702  viennacl::backend::mem_handle h;
2703  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));
2704 
2705  index_norm_inf_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(vec1),
2706  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2707  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2708  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2709  //detail::cuda_arg<unsigned int>(h.cuda_handle())
2710  reinterpret_cast<unsigned int *>(h.cuda_handle().get())
2711  );
2712  VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");
2713 
2714  unsigned int ret = 0;
2715  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
2716  return static_cast<vcl_size_t>(ret);
2717  }
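  // index_norm_inf runs its kernel with a single block of 128 threads, so the
  // block-local reduction already yields the global winner; the index is written to a
  // one-element GPU buffer and read back to the host, matching the requirement stated
  // above that a CPU scalar (vcl_size_t) be returned.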
2718 
2720 
2721  template <typename T>
2722  __global__ void plane_rotation_kernel(
2723  T * vec1,
2724  unsigned int start1,
2725  unsigned int inc1,
2726  unsigned int size1,
2727  T * vec2,
2728  unsigned int start2,
2729  unsigned int inc2,
2730  unsigned int size2,
2731  T alpha,
2732  T beta)
2733  {
2734  T tmp1 = 0;
2735  T tmp2 = 0;
2736 
2737  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
2738  {
2739  tmp1 = vec1[i*inc1+start1];
2740  tmp2 = vec2[i*inc2+start2];
2741 
2742  vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
2743  vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
2744  }
2745 
2746  }
2747 
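  // The kernel above applies a Givens/plane rotation elementwise:
  //   vec1[i] <- alpha * vec1[i] + beta * vec2[i]
  //   vec2[i] <- alpha * vec2[i] - beta * vec1_old[i]
  // where vec1_old[i] is the value of vec1[i] before the update (both old values are
  // read into tmp1/tmp2 first).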
2757  template <typename T>
2758  void plane_rotation(vector_base<T> & vec1,
2759  vector_base<T> & vec2,
2760  T alpha, T beta)
2761  {
2762  typedef T value_type;
2763 
2764  value_type temporary_alpha = 0;
2765  if (viennacl::is_cpu_scalar<T>::value)
2766  temporary_alpha = alpha;
2767 
2768  value_type temporary_beta = 0;
2769  if (viennacl::is_cpu_scalar<T>::value)
2770  temporary_beta = beta;
2771 
2772  plane_rotation_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2773  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2774  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2775  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2776  detail::cuda_arg<value_type>(vec2),
2777  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2778  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2779  static_cast<unsigned int>(viennacl::traits::size(vec2)),
2780  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2781  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
2782  VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
2783  }
2784 
2785  } //namespace cuda
2786  } //namespace linalg
2787 } //namespace viennacl
2788 
2789 
2790 #endif