1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
56 template <
typename NumericT,
typename F,
61 typedef NumericT value_type;
65 value_type temporary_alpha = 0;
67 temporary_alpha = alpha;
71 am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
79 detail::cuda_arg<value_type>(mat2),
88 am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
96 detail::cuda_arg<value_type>(mat2),
106 template <
typename NumericT,
typename F,
107 typename ScalarType1,
typename ScalarType2>
112 typedef NumericT value_type;
114 unsigned int options_alpha =
detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
116 value_type temporary_alpha = 0;
118 temporary_alpha = alpha;
123 value_type temporary_beta = 0;
125 temporary_beta = beta;
130 ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
138 detail::cuda_arg<value_type>(mat2),
145 detail::cuda_arg<value_type>(mat3),
154 ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
162 detail::cuda_arg<value_type>(mat2),
169 detail::cuda_arg<value_type>(mat3),
180 template <
typename NumericT,
typename F,
181 typename ScalarType1,
typename ScalarType2>
186 typedef NumericT value_type;
188 unsigned int options_alpha =
detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
190 value_type temporary_alpha = 0;
192 temporary_alpha = alpha;
197 value_type temporary_beta = 0;
199 temporary_beta = beta;
204 ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
212 detail::cuda_arg<value_type>(mat2),
219 detail::cuda_arg<value_type>(mat3),
228 ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
236 detail::cuda_arg<value_type>(mat2),
243 detail::cuda_arg<value_type>(mat3),
256 template <
typename NumericT,
typename F>
259 typedef NumericT value_type;
260 value_type alpha = s;
268 matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
272 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
278 matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
282 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
288 template <
typename NumericT,
typename F>
291 typedef NumericT value_type;
292 value_type alpha = s;
296 matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
306 matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
317 template <
typename NumericT,
typename F>
320 typedef NumericT value_type;
326 unsigned int options_alpha = 0;
356 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
357 static_cast<unsigned int>(mat_start),
358 static_cast<unsigned int>(mat_stride),
359 static_cast<unsigned int>(mat_size),
361 detail::cuda_arg<value_type>(NumericT(1)),
363 detail::cuda_arg<value_type>(vec),
369 template <
typename NumericT,
typename F>
372 typedef NumericT value_type;
374 unsigned int options_alpha = 0;
403 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
408 detail::cuda_arg<value_type>(NumericT(1)),
410 detail::cuda_arg<value_type>(mat),
411 static_cast<unsigned int>(mat_start),
412 static_cast<unsigned int>(mat_stride));
416 template <
typename NumericT,
typename F>
419 typedef NumericT value_type;
421 unsigned int options_alpha = 0;
436 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
441 detail::cuda_arg<value_type>(NumericT(1)),
443 detail::cuda_arg<value_type>(mat),
444 static_cast<unsigned int>(mat_start),
445 static_cast<unsigned int>(mat_stride));
449 template <
typename NumericT,
typename F>
452 typedef NumericT value_type;
454 unsigned int options_alpha = 0;
469 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
474 detail::cuda_arg<value_type>(NumericT(1)),
476 detail::cuda_arg<value_type>(mat),
477 static_cast<unsigned int>(mat_start),
478 static_cast<unsigned int>(mat_stride));
488 template <
typename T,
typename F,
typename OP>
492 typedef T value_type;
494 unsigned int op_type = 2;
502 element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
508 detail::cuda_arg<value_type>(proxy.lhs()),
513 detail::cuda_arg<value_type>(proxy.rhs()),
524 element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
530 detail::cuda_arg<value_type>(proxy.lhs()),
535 detail::cuda_arg<value_type>(proxy.rhs()),
546 template <
typename F,
typename OP>
550 typedef float value_type;
552 unsigned int op_type = 2;
560 element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
566 detail::cuda_arg<value_type>(proxy.lhs()),
571 detail::cuda_arg<value_type>(proxy.rhs()),
582 element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
588 detail::cuda_arg<value_type>(proxy.lhs()),
593 detail::cuda_arg<value_type>(proxy.rhs()),
604 template <
typename F,
typename OP>
608 typedef double value_type;
610 unsigned int op_type = 2;
618 element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
624 detail::cuda_arg<value_type>(proxy.lhs()),
629 detail::cuda_arg<value_type>(proxy.rhs()),
640 element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
646 detail::cuda_arg<value_type>(proxy.lhs()),
651 detail::cuda_arg<value_type>(proxy.rhs()),
670 template <
typename T,
typename F>
674 typedef T value_type;
678 matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
684 detail::cuda_arg<value_type>(proxy.lhs()),
693 matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
699 detail::cuda_arg<value_type>(proxy.lhs()),
710 template <
typename T,
typename F>
714 typedef T value_type;
718 matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
724 detail::cuda_arg<value_type>(proxy.lhs()),
733 matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
739 detail::cuda_arg<value_type>(proxy.lhs()),
750 template <
typename T,
typename F>
754 typedef T value_type;
758 matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
764 detail::cuda_arg<value_type>(proxy.lhs()),
773 matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
779 detail::cuda_arg<value_type>(proxy.lhs()),
790 template <
typename T,
typename F>
794 typedef T value_type;
798 matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
804 detail::cuda_arg<value_type>(proxy.lhs()),
813 matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
819 detail::cuda_arg<value_type>(proxy.lhs()),
830 template <
typename T,
typename F>
834 typedef T value_type;
838 matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
844 detail::cuda_arg<value_type>(proxy.lhs()),
853 matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
859 detail::cuda_arg<value_type>(proxy.lhs()),
870 template <
typename T,
typename F>
874 typedef T value_type;
878 matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
884 detail::cuda_arg<value_type>(proxy.lhs()),
893 matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
899 detail::cuda_arg<value_type>(proxy.lhs()),
910 template <
typename T,
typename F>
914 typedef T value_type;
918 matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
924 detail::cuda_arg<value_type>(proxy.lhs()),
933 matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
939 detail::cuda_arg<value_type>(proxy.lhs()),
950 template <
typename T,
typename F>
954 typedef T value_type;
958 matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
964 detail::cuda_arg<value_type>(proxy.lhs()),
973 matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
979 detail::cuda_arg<value_type>(proxy.lhs()),
990 template <
typename T,
typename F>
994 typedef T value_type;
998 matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1004 detail::cuda_arg<value_type>(proxy.lhs()),
1013 matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1019 detail::cuda_arg<value_type>(proxy.lhs()),
1030 template <
typename T,
typename F>
1034 typedef T value_type;
1038 matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1044 detail::cuda_arg<value_type>(proxy.lhs()),
1053 matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1059 detail::cuda_arg<value_type>(proxy.lhs()),
1070 template <
typename T,
typename F>
1074 typedef T value_type;
1078 matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1084 detail::cuda_arg<value_type>(proxy.lhs()),
1093 matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1099 detail::cuda_arg<value_type>(proxy.lhs()),
1110 template <
typename T,
typename F>
1114 typedef T value_type;
1118 matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1124 detail::cuda_arg<value_type>(proxy.lhs()),
1133 matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1139 detail::cuda_arg<value_type>(proxy.lhs()),
1150 template <
typename T,
typename F>
1154 typedef T value_type;
1158 matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1164 detail::cuda_arg<value_type>(proxy.lhs()),
1173 matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1179 detail::cuda_arg<value_type>(proxy.lhs()),
1190 template <
typename T,
typename F>
1194 typedef T value_type;
1198 matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1204 detail::cuda_arg<value_type>(proxy.lhs()),
1213 matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1219 detail::cuda_arg<value_type>(proxy.lhs()),
1230 template <
typename T,
typename F>
1234 typedef T value_type;
1238 matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1244 detail::cuda_arg<value_type>(proxy.lhs()),
1253 matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1259 detail::cuda_arg<value_type>(proxy.lhs()),
1270 template <
typename T,
typename F>
1274 typedef T value_type;
1278 matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1284 detail::cuda_arg<value_type>(proxy.lhs()),
1293 matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1299 detail::cuda_arg<value_type>(proxy.lhs()),
1310 template <
typename T,
typename F>
1314 typedef T value_type;
1318 matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1324 detail::cuda_arg<value_type>(proxy.lhs()),
1333 matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1339 detail::cuda_arg<value_type>(proxy.lhs()),
1363 template <
typename NumericT,
typename F>
1368 typedef NumericT value_type;
1374 vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1380 detail::cuda_arg<value_type>(vec),
1385 detail::cuda_arg<value_type>(result),
1394 vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1400 detail::cuda_arg<value_type>(vec),
1405 detail::cuda_arg<value_type>(result),
1425 template <
typename NumericT,
typename F>
1433 typedef NumericT value_type;
1441 trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
1447 detail::cuda_arg<value_type>(vec),
1452 detail::cuda_arg<value_type>(result),
1461 trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
1467 detail::cuda_arg<value_type>(vec),
1472 detail::cuda_arg<value_type>(result),
1489 template <
typename T1,
typename T2,
typename T3,
typename ScalarType >
1491 const T2 & B,
bool transposed_B,
1498 cpu_value_type converted_alpha =
static_cast<cpu_value_type
>(alpha);
1499 cpu_value_type converted_beta =
static_cast<cpu_value_type
>(beta);
1501 dim3 threads(16, 16);
1510 if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1512 matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
1514 detail::cuda_arg<cpu_value_type>(A),
1520 detail::cuda_arg<cpu_value_type>(B),
1527 detail::cuda_arg<cpu_value_type>(C),
1533 else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1535 matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
1537 detail::cuda_arg<cpu_value_type>(A),
1543 detail::cuda_arg<cpu_value_type>(B),
1550 detail::cuda_arg<cpu_value_type>(C),
1556 else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1558 matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
1560 detail::cuda_arg<cpu_value_type>(A),
1566 detail::cuda_arg<cpu_value_type>(B),
1573 detail::cuda_arg<cpu_value_type>(C),
1579 else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1581 matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
1583 detail::cuda_arg<cpu_value_type>(A),
1589 detail::cuda_arg<cpu_value_type>(B),
1596 detail::cuda_arg<cpu_value_type>(C),
1604 else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1606 matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
1608 detail::cuda_arg<cpu_value_type>(A),
1614 detail::cuda_arg<cpu_value_type>(B),
1621 detail::cuda_arg<cpu_value_type>(C),
1627 else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
1629 matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
1631 detail::cuda_arg<cpu_value_type>(A),
1637 detail::cuda_arg<cpu_value_type>(B),
1644 detail::cuda_arg<cpu_value_type>(C),
1650 else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
1652 matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
1654 detail::cuda_arg<cpu_value_type>(A),
1660 detail::cuda_arg<cpu_value_type>(B),
1667 detail::cuda_arg<cpu_value_type>(C),
1673 else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
1675 matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
1677 detail::cuda_arg<cpu_value_type>(A),
1683 detail::cuda_arg<cpu_value_type>(B),
1690 detail::cuda_arg<cpu_value_type>(C),
1698 else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
1700 matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
1702 detail::cuda_arg<cpu_value_type>(A),
1708 detail::cuda_arg<cpu_value_type>(B),
1715 detail::cuda_arg<cpu_value_type>(C),
1721 else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
1723 matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
1725 detail::cuda_arg<cpu_value_type>(A),
1731 detail::cuda_arg<cpu_value_type>(B),
1738 detail::cuda_arg<cpu_value_type>(C),
1744 else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
1746 matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
1748 detail::cuda_arg<cpu_value_type>(A),
1754 detail::cuda_arg<cpu_value_type>(B),
1761 detail::cuda_arg<cpu_value_type>(C),
1767 else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
1769 matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
1771 detail::cuda_arg<cpu_value_type>(A),
1777 detail::cuda_arg<cpu_value_type>(B),
1784 detail::cuda_arg<cpu_value_type>(C),
1792 else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
1794 matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
1796 detail::cuda_arg<cpu_value_type>(A),
1802 detail::cuda_arg<cpu_value_type>(B),
1809 detail::cuda_arg<cpu_value_type>(C),
1815 else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
1817 matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
1819 detail::cuda_arg<cpu_value_type>(A),
1825 detail::cuda_arg<cpu_value_type>(B),
1832 detail::cuda_arg<cpu_value_type>(C),
1838 else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
1840 matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
1842 detail::cuda_arg<cpu_value_type>(A),
1848 detail::cuda_arg<cpu_value_type>(B),
1855 detail::cuda_arg<cpu_value_type>(C),
1861 else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
1863 matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
1865 detail::cuda_arg<cpu_value_type>(A),
1871 detail::cuda_arg<cpu_value_type>(B),
1878 detail::cuda_arg<cpu_value_type>(C),
1886 else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1888 matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
1890 detail::cuda_arg<cpu_value_type>(A),
1896 detail::cuda_arg<cpu_value_type>(B),
1903 detail::cuda_arg<cpu_value_type>(C),
1909 else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1911 matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
1913 detail::cuda_arg<cpu_value_type>(A),
1919 detail::cuda_arg<cpu_value_type>(B),
1926 detail::cuda_arg<cpu_value_type>(C),
1932 else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1934 matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
1936 detail::cuda_arg<cpu_value_type>(A),
1942 detail::cuda_arg<cpu_value_type>(B),
1949 detail::cuda_arg<cpu_value_type>(C),
1955 else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1957 matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
1959 detail::cuda_arg<cpu_value_type>(A),
1965 detail::cuda_arg<cpu_value_type>(B),
1972 detail::cuda_arg<cpu_value_type>(C),
1980 else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1982 matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
1984 detail::cuda_arg<cpu_value_type>(A),
1990 detail::cuda_arg<cpu_value_type>(B),
1997 detail::cuda_arg<cpu_value_type>(C),
2003 else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
2005 matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
2007 detail::cuda_arg<cpu_value_type>(A),
2013 detail::cuda_arg<cpu_value_type>(B),
2020 detail::cuda_arg<cpu_value_type>(C),
2026 else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
2028 matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
2030 detail::cuda_arg<cpu_value_type>(A),
2036 detail::cuda_arg<cpu_value_type>(B),
2043 detail::cuda_arg<cpu_value_type>(C),
2049 else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
2051 matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
2053 detail::cuda_arg<cpu_value_type>(A),
2059 detail::cuda_arg<cpu_value_type>(B),
2066 detail::cuda_arg<cpu_value_type>(C),
2074 else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
2076 matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
2078 detail::cuda_arg<cpu_value_type>(A),
2084 detail::cuda_arg<cpu_value_type>(B),
2091 detail::cuda_arg<cpu_value_type>(C),
2097 else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
2099 matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
2101 detail::cuda_arg<cpu_value_type>(A),
2107 detail::cuda_arg<cpu_value_type>(B),
2114 detail::cuda_arg<cpu_value_type>(C),
2120 else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
2122 matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
2124 detail::cuda_arg<cpu_value_type>(A),
2130 detail::cuda_arg<cpu_value_type>(B),
2137 detail::cuda_arg<cpu_value_type>(C),
2143 else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
2145 matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
2147 detail::cuda_arg<cpu_value_type>(A),
2153 detail::cuda_arg<cpu_value_type>(B),
2160 detail::cuda_arg<cpu_value_type>(C),
2170 else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
2172 matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
2174 detail::cuda_arg<cpu_value_type>(A),
2180 detail::cuda_arg<cpu_value_type>(B),
2187 detail::cuda_arg<cpu_value_type>(C),
2193 else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
2195 matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
2197 detail::cuda_arg<cpu_value_type>(A),
2203 detail::cuda_arg<cpu_value_type>(B),
2210 detail::cuda_arg<cpu_value_type>(C),
2216 else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
2218 matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
2220 detail::cuda_arg<cpu_value_type>(A),
2226 detail::cuda_arg<cpu_value_type>(B),
2233 detail::cuda_arg<cpu_value_type>(C),
2239 else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
2241 matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
2243 detail::cuda_arg<cpu_value_type>(A),
2249 detail::cuda_arg<cpu_value_type>(B),
2256 detail::cuda_arg<cpu_value_type>(C),
2266 template <
typename T1,
typename T2,
typename T3,
typename ScalarType >
2272 std::string kernel_name)
2276 cpu_value_type cl_alpha =
static_cast<cpu_value_type
>(alpha);
2277 cpu_value_type cl_beta =
static_cast<cpu_value_type
>(beta);
2301 throw "not implemented yet";
2304 template <
typename T1,
typename T2,
typename T3,
typename ScalarType >
2305 void prod(
const T1 & A,
bool transposed_A,
2306 const T2 & B,
bool transposed_B,
2342 template <
typename NumericT,
typename F1,
typename F2,
typename F3,
typename ScalarType >
2371 template <
typename NumericT,
typename F1,
typename F2,
typename F3,
typename ScalarType >
2389 &&
bool(
"No direct inplace matrix-matrix product possible. Introduce a temporary!"));
2404 template <
typename NumericT,
typename F1,
typename F2,
typename F3,
typename ScalarType >
2428 template <
typename NumericT,
typename F1,
typename F2,
typename F3,
typename ScalarType >
2442 && bool(
"No direct inplace matrix-matrix product possible. Introduce a temporary!"));
2469 template <
typename NumericT,
typename F,
typename S1>
2471 S1
const & alpha,
vcl_size_t len_alpha,
bool reciprocal_alpha,
bool flip_sign_alpha,
2478 typedef NumericT value_type;
2480 unsigned int options_alpha =
detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
2482 value_type temporary_alpha = 0;
2484 temporary_alpha = alpha;
2488 scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2497 detail::cuda_arg<value_type>(vec1),
2502 detail::cuda_arg<value_type>(vec2),
2511 scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2520 detail::cuda_arg<value_type>(vec1),
2525 detail::cuda_arg<value_type>(vec2),
std::size_t vcl_size_t
Definition: forwards.h:58
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
Definition: common.hpp:37
result_of::size_type< matrix_base< NumericT, F > >::type stride2(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:68
Generic size and resize functionality for different vector and matrix types.
Common routines for CUDA execution.
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:399
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< T > &s, U)
Definition: common.hpp:127
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
void matrix_diagonal_assign(matrix_base< NumericT, F > &mat, NumericT s)
Definition: matrix_operations.hpp:289
A dense matrix class.
Definition: forwards.h:290
void prod(const T1 &A, bool transposed_A, const T2 &B, bool transposed_B, T3 &C, ScalarType alpha, ScalarType beta)
Definition: matrix_operations.hpp:2305
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:283
Implementations of row-major dense matrix related operations, including matrix-vector products...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL obj...
Definition: clear.hpp:57
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
Determines row and column increments for matrices and matrix proxies.
void prod_fast_kernel(const T1 &A, const T2 &B, T3 &C, ScalarType alpha, ScalarType beta, std::string kernel_name)
Definition: matrix_operations.hpp:2267
Implementations of column-major dense matrix related operations, including matrix-vector products...
void matrix_diag_from_vector(const vector_base< NumericT > &vec, int k, matrix_base< NumericT, F > &mat)
Definition: matrix_operations.hpp:318
void matrix_assign(matrix_base< NumericT, F > &mat, NumericT s, bool clear=false)
Definition: matrix_operations.hpp:257
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
result_of::size_type< matrix_base< NumericT, F > >::type stride1(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:57
void ambm_m(matrix_base< NumericT, F > &mat1, matrix_base< NumericT, F > const &mat2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT, F > const &mat3, ScalarType2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Definition: matrix_operations.hpp:182
void scaled_rank_1_update(matrix_base< NumericT, F > &mat1, S1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, const vector_base< NumericT > &vec1, const vector_base< NumericT > &vec2)
The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update...
Definition: matrix_operations.hpp:2470
void element_op(matrix_base< T, F > &A, matrix_expression< const matrix_base< T, F >, const matrix_base< T, F >, op_element_binary< OP > > const &proxy)
Definition: matrix_operations.hpp:489
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
Definition: forwards.h:363
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:43
Common base class for dense vectors, vector ranges, and vector slices.
Definition: forwards.h:205
Dense matrix-matrix product CUDA kernels reside here.
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
Definition: predicate.hpp:448
vcl_size_t internal_size2(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
Proxy classes for vectors.
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void matrix_diag_to_vector(const matrix_base< NumericT, F > &mat, int k, vector_base< NumericT > &vec)
Definition: matrix_operations.hpp:370
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:276
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void prod_slow_kernel(const T1 &A, bool transposed_A, const T2 &B, bool transposed_B, T3 &C, ScalarType alpha, ScalarType beta)
Definition: matrix_operations.hpp:1490
A tag class representing transposed matrices.
Definition: forwards.h:165
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
Definition: forwards.h:86
vcl_size_t internal_size1(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
viennacl::backend::mem_handle & handle(T &obj)
Returns the generic memory handle of an object. Non-const version.
Definition: handle.hpp:41
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multipl...
Definition: predicate.hpp:418
void am(matrix_base< NumericT, F > &mat1, matrix_base< NumericT, F > const &mat2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
Definition: matrix_operations.hpp:58
void matrix_row(const matrix_base< NumericT, F > &mat, unsigned int i, vector_base< NumericT > &vec)
Definition: matrix_operations.hpp:417
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Definition: forwards.h:90
Implementation of the ViennaCL scalar class.
Implementations of vector operations using a plain single-threaded execution on CPU.
void ambm(matrix_base< NumericT, F > &mat1, matrix_base< NumericT, F > const &mat2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT, F > const &mat3, ScalarType2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Definition: matrix_operations.hpp:108
A collection of compile time type deductions.
void matrix_column(const matrix_base< NumericT, F > &mat, unsigned int j, vector_base< NumericT > &vec)
Definition: matrix_operations.hpp:450
void prod_impl(const matrix_base< NumericT, F > &mat, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
Definition: matrix_operations.hpp:1364
Simple enable-if variant that uses the SFINAE pattern.