Actual source code: fnexp.c
slepc-3.15.1 2021-05-28
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h>
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: *y = PetscExpScalar(x);
21: return(0);
22: }
24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
27: *y = PetscExpScalar(x);
28: return(0);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) {t=a;a=b;b=t;}
34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
36: PetscErrorCode ierr;
37: PetscBLASInt n=0,ld,ld2,*ipiv,info,inc=1;
38: PetscInt m,j,k,sexp;
39: PetscBool odd;
40: const PetscInt p=MAX_PADE;
41: PetscReal c[MAX_PADE+1],s,*rwork;
42: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
43: PetscScalar *Ba,*As,*A2,*Q,*P,*W,*aux;
44: const PetscScalar *Aa;
47: MatDenseGetArrayRead(A,&Aa);
48: MatDenseGetArray(B,&Ba);
49: MatGetSize(A,&m,NULL);
50: PetscBLASIntCast(m,&n);
51: ld = n;
52: ld2 = ld*ld;
53: P = Ba;
54: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
55: PetscArraycpy(As,Aa,ld2);
57: /* Pade' coefficients */
58: c[0] = 1.0;
59: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
61: /* Scaling */
62: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
63: PetscLogFlops(1.0*n*n);
64: if (s>0.5) {
65: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
66: scale = PetscPowRealInt(2.0,-sexp);
67: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
68: PetscLogFlops(1.0*n*n);
69: } else sexp = 0;
71: /* Horner evaluation */
72: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
73: PetscLogFlops(2.0*n*n*n);
74: PetscArrayzero(Q,ld2);
75: PetscArrayzero(P,ld2);
76: for (j=0;j<n;j++) {
77: Q[j+j*ld] = c[p];
78: P[j+j*ld] = c[p-1];
79: }
81: odd = PETSC_TRUE;
82: for (k=p-1;k>0;k--) {
83: if (odd) {
84: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
85: SWAP(Q,W,aux);
86: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
87: odd = PETSC_FALSE;
88: } else {
89: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
90: SWAP(P,W,aux);
91: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
92: odd = PETSC_TRUE;
93: }
94: PetscLogFlops(2.0*n*n*n);
95: }
96: /*if (odd) {
97: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
98: SWAP(Q,W,aux);
99: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
100: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
101: SlepcCheckLapackInfo("gesv",info);
102: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
103: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
104: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
105: } else {*/
106: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
107: SWAP(P,W,aux);
108: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
109: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
110: SlepcCheckLapackInfo("gesv",info);
111: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
112: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
113: /*}*/
114: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
116: for (k=1;k<=sexp;k++) {
117: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
118: PetscArraycpy(P,W,ld2);
119: }
120: if (P!=Ba) { PetscArraycpy(Ba,P,ld2); }
121: PetscLogFlops(2.0*n*n*n*sexp);
123: PetscFree6(Q,W,As,A2,rwork,ipiv);
124: MatDenseRestoreArrayRead(A,&Aa);
125: MatDenseRestoreArray(B,&Ba);
126: return(0);
127: }
129: /*
130: * Set scaling factor (s) and Pade degree (k,m)
131: */
132: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
133: {
135: if (nrm>1) {
136: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
137: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
138: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
139: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
140: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
141: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
142: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
143: else {*s = 1; *k = 1; *m = *k+1;}
144: } else { /* nrm<1 */
145: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
146: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
147: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
148: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
149: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
150: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
151: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
152: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
153: else {*s = 0; *k = 1; *m = 0;}
154: }
155: return(0);
156: }
158: #if defined(PETSC_HAVE_COMPLEX)
159: /*
160: * Partial fraction form coefficients.
161: * If query, the function returns the size necessary to store the coefficients.
162: */
163: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
164: {
165: PetscInt i;
166: const PetscComplex /* m == k+1 */
167: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
168: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
169: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
170: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
171: -2.733432894659307e+02 },
172: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
173: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
174: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
175: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
176: 6.286704751729261e+00 },
177: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
178: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
179: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
180: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
181: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
182: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
183: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
184: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
185: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
186: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
187: -1.829749817484586e+01 },
188: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
189: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
190: 3.637834252744491e+00 },
191: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
192: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
193: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
194: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
195: const PetscComplex /* m == k-1 */
196: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
197: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
198: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
199: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
200: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
201: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
202: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
203: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
204: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
205: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
206: -1.734353918633177e+02 },
207: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
208: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
209: 5.648485971016893e+00 },
210: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
211: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
212: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
213: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
214: const PetscScalar /* m == k-1 */
215: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
216: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
217: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
218: m1remain2[2] = {-0.5, -3.5},
219: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
220: remain2[3] = {1.0/2.0, 1, 1};
223: if (query) { /* query about buffer's size */
224: if (m==k+1) {
225: *remain = 0;
226: *r = *q = k+1;
227: return(0); /* quick return */
228: }
229: if (m==k-1) {
230: *remain = 2;
231: if (k==5) *r = *q = 4;
232: else if (k==4) *r = *q = 3;
233: else if (k==3) *r = *q = 2;
234: else if (k==2) *r = *q = 1;
235: }
236: if (m==0) {
237: *r = *q = 0;
238: *remain = k+1;
239: }
240: } else {
241: if (m==k+1) {
242: if (k==4) {
243: for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
244: } else if (k==3) {
245: for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
246: } else if (k==2) {
247: for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
248: } else if (k==1) {
249: for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
250: }
251: return(0); /* quick return */
252: }
253: if (m==k-1) {
254: if (k==5) {
255: for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
256: for (i=0;i<2;i++) remain[i] = m1remain5[i];
257: } else if (k==4) {
258: for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
259: for (i=0;i<2;i++) remain[i] = m1remain4[i];
260: } else if (k==3) {
261: for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
262: } else if (k==2) {
263: r[0] = -13.5; q[0] = 3;
264: for (i=0;i<2;i++) remain[i] = m1remain2[i];
265: }
266: }
267: if (m==0) {
268: r = q = 0;
269: if (k==3) {
270: for (i=0;i<4;i++) remain[i] = remain3[i];
271: } else if (k==2) {
272: for (i=0;i<3;i++) remain[i] = remain2[i];
273: }
274: }
275: }
276: return(0);
277: }
279: /*
280: * Product form coefficients.
281: * If query, the function returns the size necessary to store the coefficients.
282: */
283: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
284: {
285: PetscInt i;
286: const PetscComplex /* m == k+1 */
287: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
288: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
289: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
290: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
291: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
292: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
293: 6.286704751729261e+00 ,
294: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
295: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
296: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
297: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
298: -5.648485971016893e+00 },
299: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
300: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
301: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
302: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
303: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
304: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
305: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
306: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
307: 3.637834252744491e+00 },
308: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
309: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
310: const PetscComplex /* m == k-1 */
311: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
312: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
313: -6.286704751729261e+00 ,
314: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
315: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
316: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
317: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
318: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
319: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
320: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
321: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
322: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
323: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
324: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
325: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
326: 5.648485971016893e+00 },
327: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
328: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
329: -3.637834252744491e+00 },
330: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
331: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
332: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
333: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
336: if (query) {
337: if (m == k+1) {
338: *mult = 1;
339: *p = k;
340: *q = k+1;
341: return(0);
342: }
343: if (m==k-1) {
344: *mult = 1;
345: *p = k;
346: *q = k-1;
347: }
348: } else {
349: if (m == k+1) {
350: *mult = PetscPowInt(-1,m);
351: *mult *= m;
352: if (k==4) {
353: for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
354: q[4] = p1q4[4];
355: } else if (k==3) {
356: for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
357: q[3] = p1q3[3];
358: } else if (k==2) {
359: for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
360: q[2] = p1q2[2];
361: } else if (k==1) {
362: p[0] = -3;
363: for (i=0;i<2;i++) q[i] = p1q1[i];
364: }
365: return(0);
366: }
367: if (m==k-1) {
368: *mult = PetscPowInt(-1,m);
369: *mult /= k;
370: if (k==5) {
371: for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
372: p[4] = m1p5[4];
373: } else if (k==4) {
374: for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
375: p[3] = m1p4[3];
376: } else if (k==3) {
377: for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
378: p[2] = m1p3[2];
379: } else if (k==2) {
380: for (i=0;i<2;i++) p[i] = m1p2[i];
381: q[0] = 3;
382: }
383: }
384: }
385: return(0);
386: }
387: #endif /* PETSC_HAVE_COMPLEX */
389: #if defined(PETSC_USE_COMPLEX)
390: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
391: {
392: PetscInt i;
395: *result=PETSC_TRUE;
396: for (i=0;i<n&&*result;i++) {
397: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
398: }
399: return(0);
400: }
401: #endif
403: /*
404: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
405: * and Yuji Nakatsukasa
406: *
407: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
408: * Approximation for the Matrix Exponential",
409: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
410: * https://doi.org/10.1137/15M1027553
411: */
412: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
413: {
414: #if !defined(PETSC_HAVE_COMPLEX)
416: SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
417: #else
418: PetscInt i,j,n_,s,k,m,mod;
419: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
420: PetscReal nrm,shift=0.0;
421: #if defined(PETSC_USE_COMPLEX) || defined(PETSC_HAVE_ESSL)
422: PetscReal *rwork=NULL;
423: #endif
424: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
425: PetscScalar *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
426: const PetscScalar *Aa;
427: PetscErrorCode ierr;
428: PetscBool isreal,flg;
429: #if defined(PETSC_HAVE_ESSL)
430: PetscScalar sdummy,*wri;
431: PetscBLASInt idummy,io=0;
432: #else
433: PetscBLASInt query=-1;
434: PetscScalar work1,*work;
435: #endif
438: MatGetSize(A,&n_,NULL);
439: PetscBLASIntCast(n_,&n);
440: MatDenseGetArrayRead(A,&Aa);
441: MatDenseGetArray(B,&Ba);
442: Ba2 = Ba;
443: PetscBLASIntCast(n*n,&n2);
445: PetscMalloc2(n2,&sMaux,n2,&Maux);
446: Maux2 = Maux;
447: PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
448: if (!flg) {
449: PetscMalloc2(n,&wr,n,&wi);
450: PetscArraycpy(sMaux,Aa,n2);
451: /* estimate rightmost eigenvalue and shift A with it */
452: #if !defined(PETSC_HAVE_ESSL)
453: #if !defined(PETSC_USE_COMPLEX)
454: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
455: SlepcCheckLapackInfo("geev",info);
456: PetscBLASIntCast((PetscInt)work1,&lwork);
457: PetscMalloc1(lwork,&work);
458: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
459: PetscFree(work);
460: #else
461: PetscArraycpy(Maux,Aa,n2);
462: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
463: SlepcCheckLapackInfo("geev",info);
464: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
465: PetscMalloc2(2*n,&rwork,lwork,&work);
466: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
467: PetscFree2(rwork,work);
468: #endif
469: SlepcCheckLapackInfo("geev",info);
470: #else /* defined(PETSC_HAVE_ESSL) */
471: PetscBLASIntCast(4*n,&lwork);
472: PetscMalloc2(lwork,&rwork,2*n,&wri);
473: #if !defined(PETSC_USE_COMPLEX)
474: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,sMaux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
475: for (i=0;i<n;i++) {
476: wr[i] = wri[2*i];
477: wi[i] = wri[2*i+1];
478: }
479: #else
480: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,Maux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
481: for (i=0;i<n;i++) wr[i] = wri[i];
482: #endif
483: PetscFree2(rwork,wri);
484: #endif
485: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
487: shift = PetscRealPart(wr[0]);
488: for (i=1;i<n;i++) {
489: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
490: }
491: PetscFree2(wr,wi);
492: }
493: /* shift so that largest real part is (about) 0 */
494: PetscArraycpy(sMaux,Aa,n2);
495: if (shift) {
496: for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
497: PetscLogFlops(1.0*n);
498: }
499: #if defined(PETSC_USE_COMPLEX)
500: PetscArraycpy(Maux,Aa,n2);
501: if (shift) {
502: for (i=0;i<n;i++) Maux[i+i*n] -= shift;
503: PetscLogFlops(1.0*n);
504: }
505: #endif
507: /* estimate norm(A) and select the scaling factor */
508: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
509: PetscLogFlops(1.0*n*n);
510: sexpm_params(nrm,&s,&k,&m);
511: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
512: if (shift) expshift = PetscExpReal(shift);
513: for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
514: if (shift) {
515: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
516: PetscLogFlops(1.0*(n+n2));
517: } else {
518: PetscLogFlops(1.0*n);
519: }
520: PetscArraycpy(Ba,sMaux,n2);
521: PetscFree2(sMaux,Maux);
522: MatDenseRestoreArrayRead(A,&Aa);
523: MatDenseRestoreArray(B,&Ba);
524: return(0); /* quick return */
525: }
527: PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
528: expmA2 = expmA; RR2 = RR;
529: /* scale matrix */
530: #if !defined(PETSC_USE_COMPLEX)
531: for (i=0;i<n2;i++) {
532: As[i] = sMaux[i];
533: }
534: #else
535: PetscArraycpy(As,sMaux,n2);
536: #endif
537: scale = 1.0/PetscPowRealInt(2.0,s);
538: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
539: SlepcLogFlopsComplex(1.0*n2);
541: /* evaluate Pade approximant (partial fraction or product form) */
542: if (fn->method==3 || !m) { /* partial fraction */
543: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
544: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
545: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
546: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
547: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
548: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
550: PetscArrayzero(expmA,n2);
551: #if !defined(PETSC_USE_COMPLEX)
552: isreal = PETSC_TRUE;
553: #else
554: getisreal(n2,Maux,&isreal);
555: #endif
556: if (isreal) {
557: rsizediv2 = irsize/2;
558: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
559: PetscArraycpy(Maux,As,n2);
560: PetscArrayzero(RR,n2);
561: for (j=0;j<n;j++) {
562: Maux[j+j*n] -= p[2*i];
563: RR[j+j*n] = r[2*i];
564: }
565: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
566: SlepcCheckLapackInfo("gesv",info);
567: for (j=0;j<n2;j++) {
568: expmA[j] += RR[j] + PetscConj(RR[j]);
569: }
570: /* loop(n) + gesv + loop(n2) */
571: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
572: }
574: mod = ipsize % 2;
575: if (mod) {
576: PetscArraycpy(Maux,As,n2);
577: PetscArrayzero(RR,n2);
578: for (j=0;j<n;j++) {
579: Maux[j+j*n] -= p[ipsize-1];
580: RR[j+j*n] = r[irsize-1];
581: }
582: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
583: SlepcCheckLapackInfo("gesv",info);
584: for (j=0;j<n2;j++) {
585: expmA[j] += RR[j];
586: }
587: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
588: }
589: } else { /* complex */
590: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
591: PetscArraycpy(Maux,As,n2);
592: PetscArrayzero(RR,n2);
593: for (j=0;j<n;j++) {
594: Maux[j+j*n] -= p[i];
595: RR[j+j*n] = r[i];
596: }
597: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
598: SlepcCheckLapackInfo("gesv",info);
599: for (j=0;j<n2;j++) {
600: expmA[j] += RR[j];
601: }
602: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
603: }
604: }
605: for (i=0;i<iremainsize;i++) {
606: if (!i) {
607: PetscArrayzero(RR,n2);
608: for (j=0;j<n;j++) {
609: RR[j+j*n] = remainterm[iremainsize-1];
610: }
611: } else {
612: PetscArraycpy(RR,As,n2);
613: for (j=1;j<i;j++) {
614: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
615: SWAP(RR,Maux,aux);
616: SlepcLogFlopsComplex(2.0*n*n*n);
617: }
618: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
619: SlepcLogFlopsComplex(1.0*n2);
620: }
621: for (j=0;j<n2;j++) {
622: expmA[j] += RR[j];
623: }
624: SlepcLogFlopsComplex(1.0*n2);
625: }
626: PetscFree3(r,p,remainterm);
627: } else { /* product form, default */
628: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
629: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
630: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
631: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
632: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
634: PetscArrayzero(expmA,n2);
635: for (i=0;i<n;i++) { /* initialize */
636: expmA[i+i*n] = 1.0;
637: }
638: minlen = PetscMin(irsize,ipsize);
639: for (i=0;i<minlen;i++) {
640: PetscArraycpy(RR,As,n2);
641: for (j=0;j<n;j++) {
642: RR[j+j*n] -= rootp[i];
643: }
644: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
645: SWAP(expmA,Maux,aux);
646: PetscArraycpy(RR,As,n2);
647: for (j=0;j<n;j++) {
648: RR[j+j*n] -= rootq[i];
649: }
650: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
651: SlepcCheckLapackInfo("gesv",info);
652: /* loop(n) + gemm + loop(n) + gesv */
653: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
654: }
655: /* extra numerator */
656: for (i=minlen;i<irsize;i++) {
657: PetscArraycpy(RR,As,n2);
658: for (j=0;j<n;j++) {
659: RR[j+j*n] -= rootp[i];
660: }
661: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
662: SWAP(expmA,Maux,aux);
663: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
664: }
665: /* extra denominator */
666: for (i=minlen;i<ipsize;i++) {
667: PetscArraycpy(RR,As,n2);
668: for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
669: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
670: SlepcCheckLapackInfo("gesv",info);
671: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
672: }
673: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
674: SlepcLogFlopsComplex(1.0*n2);
675: PetscFree2(rootp,rootq);
676: }
678: #if !defined(PETSC_USE_COMPLEX)
679: for (i=0;i<n2;i++) {
680: Ba2[i] = PetscRealPartComplex(expmA[i]);
681: }
682: #else
683: PetscArraycpy(Ba2,expmA,n2);
684: #endif
686: /* perform repeated squaring */
687: for (i=0;i<s;i++) { /* final squaring */
688: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
689: SWAP(Ba2,sMaux,saux);
690: PetscLogFlops(2.0*n*n*n);
691: }
692: if (Ba2!=Ba) {
693: PetscArraycpy(Ba,Ba2,n2);
694: sMaux = Ba2;
695: }
696: if (shift) {
697: expshift = PetscExpReal(shift);
698: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
699: PetscLogFlops(1.0*n2);
700: }
702: /* restore pointers */
703: Maux = Maux2; expmA = expmA2; RR = RR2;
704: PetscFree2(sMaux,Maux);
705: PetscFree4(expmA,As,RR,piv);
706: MatDenseRestoreArrayRead(A,&Aa);
707: MatDenseRestoreArray(B,&Ba);
708: return(0);
709: #endif
710: }
712: #define SMALLN 100
714: /*
715: * Function needed to compute optimal parameters (required workspace is 3*n*n)
716: */
717: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
718: {
719: PetscScalar *Ascaled=work;
720: PetscReal nrm,alpha,beta,rwork[1];
721: PetscInt t;
722: PetscBLASInt i,j;
726: beta = PetscPowReal(coeff,1.0/(2*m+1));
727: for (i=0;i<n;i++)
728: for (j=0;j<n;j++)
729: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
730: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
731: PetscLogFlops(2.0*n*n);
732: SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
733: alpha /= nrm;
734: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
735: PetscFunctionReturn(t);
736: }
738: /*
739: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
740: */
741: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
742: {
743: PetscErrorCode ierr;
744: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
745: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
746: PetscBLASInt n_=0,n2,one=1;
747: PetscRandom rand;
748: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
749: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
750: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
751: 2.539398330063230e-001, /* m = 5 */
752: 9.504178996162932e-001, /* m = 7 */
753: 2.097847961257068e+000, /* m = 9 */
754: 5.371920351148152e+000 }; /* m = 13 */
757: *s = 0;
758: *m = 13;
759: PetscBLASIntCast(n,&n_);
760: PetscRandomCreate(PETSC_COMM_SELF,&rand);
761: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
762: if (d4==0.0) { /* safeguard for the case A = 0 */
763: *m = 3;
764: goto done;
765: }
766: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
767: PetscLogFlops(2.0*n*n);
768: eta1 = PetscMax(d4,d6);
769: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
770: *m = 3;
771: goto done;
772: }
773: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
774: *m = 5;
775: goto done;
776: }
777: if (n<SMALLN) {
778: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
779: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
780: PetscLogFlops(2.0*n*n*n+1.0*n*n);
781: } else {
782: SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
783: d8 = PetscPowReal(d8,1.0/8.0);
784: }
785: eta3 = PetscMax(d6,d8);
786: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
787: *m = 7;
788: goto done;
789: }
790: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
791: *m = 9;
792: goto done;
793: }
794: if (n<SMALLN) {
795: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
796: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
797: PetscLogFlops(2.0*n*n*n+1.0*n*n);
798: } else {
799: SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
800: d10 = PetscPowReal(d10,1.0/10.0);
801: }
802: eta4 = PetscMax(d8,d10);
803: eta5 = PetscMin(eta3,eta4);
804: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
805: if (*s) {
806: Ascaled = work+3*n*n;
807: n2 = n_*n_;
808: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
809: sfactor = PetscPowRealInt(2.0,-(*s));
810: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
811: PetscLogFlops(1.0*n*n);
812: } else Ascaled = A;
813: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
814: done:
815: PetscRandomDestroy(&rand);
816: return(0);
817: }
819: /*
820: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
821: *
822: * N. J. Higham, "The scaling and squaring method for the matrix exponential
823: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
824: */
825: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
826: {
827: PetscErrorCode ierr;
828: PetscBLASInt n_=0,n2,*ipiv,info,one=1;
829: PetscInt n,m,j,s;
830: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
831: PetscScalar *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
832: const PetscScalar *Aa,*c;
833: const PetscScalar c3[4] = { 120, 60, 12, 1 };
834: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
835: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
836: const PetscScalar c9[10] = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
837: 2162160, 110880, 3960, 90, 1 };
838: const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
839: 1187353796428800.0, 129060195264000.0, 10559470521600.0,
840: 670442572800.0, 33522128640.0, 1323241920.0,
841: 40840800, 960960, 16380, 182, 1 };
844: MatDenseGetArrayRead(A,&Aa);
845: MatDenseGetArray(B,&Ba);
846: MatGetSize(A,&n,NULL);
847: PetscBLASIntCast(n,&n_);
848: n2 = n_*n_;
849: PetscMalloc2(8*n*n,&work,n,&ipiv);
851: /* Matrix powers */
852: Apowers[0] = work; /* Apowers[0] = A */
853: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
854: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
855: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
856: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
858: PetscArraycpy(Apowers[0],Aa,n2);
859: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
860: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
861: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
862: PetscLogFlops(6.0*n*n*n);
864: /* Compute scaling parameter and order of Pade approximant */
865: expm_params(n,Apowers,&s,&m,Apowers[4]);
867: if (s) { /* rescale */
868: for (j=0;j<4;j++) {
869: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
870: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
871: }
872: PetscLogFlops(4.0*n*n);
873: }
875: /* Evaluate the Pade approximant */
876: switch (m) {
877: case 3: c = c3; break;
878: case 5: c = c5; break;
879: case 7: c = c7; break;
880: case 9: c = c9; break;
881: case 13: c = c13; break;
882: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
883: }
884: P = Ba;
885: Q = Apowers[4] + n*n;
886: W = Q + n*n;
887: switch (m) {
888: case 3:
889: case 5:
890: case 7:
891: case 9:
892: if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
893: PetscArrayzero(P,n2);
894: PetscArrayzero(Q,n2);
895: for (j=0;j<n;j++) {
896: P[j+j*n] = c[1];
897: Q[j+j*n] = c[0];
898: }
899: for (j=m;j>=3;j-=2) {
900: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
901: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
902: PetscLogFlops(4.0*n*n);
903: }
904: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
905: PetscLogFlops(2.0*n*n*n);
906: SWAP(P,W,aux);
907: break;
908: case 13:
909: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
910: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
911: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
912: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
913: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
914: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
915: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
916: PetscLogFlops(5.0*n*n+2.0*n*n*n);
917: PetscArrayzero(P,n2);
918: for (j=0;j<n;j++) P[j+j*n] = c[1];
919: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
920: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
921: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
922: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
923: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
924: PetscLogFlops(7.0*n*n+2.0*n*n*n);
925: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
926: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
927: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
928: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
929: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
930: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
931: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
932: PetscLogFlops(5.0*n*n+2.0*n*n*n);
933: PetscArrayzero(Q,n2);
934: for (j=0;j<n;j++) Q[j+j*n] = c[0];
935: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
936: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
937: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
938: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
939: PetscLogFlops(7.0*n*n);
940: break;
941: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
942: }
943: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
944: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
945: SlepcCheckLapackInfo("gesv",info);
946: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
947: for (j=0;j<n;j++) P[j+j*n] += 1.0;
948: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
950: /* Squaring */
951: for (j=1;j<=s;j++) {
952: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
953: SWAP(P,W,aux);
954: }
955: if (P!=Ba) { PetscArraycpy(Ba,P,n2); }
956: PetscLogFlops(2.0*n*n*n*s);
958: PetscFree2(work,ipiv);
959: MatDenseRestoreArrayRead(A,&Aa);
960: MatDenseRestoreArray(B,&Ba);
961: return(0);
962: }
964: #if defined(PETSC_HAVE_CUDA)
965: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
966: #include <slepccublas.h>
968: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
969: {
971: PetscBLASInt n=0,ld,ld2,*d_ipiv,*d_info,info,one=1,zero=0;
972: PetscInt m,k,sexp;
973: PetscBool odd;
974: const PetscInt p=MAX_PADE;
975: PetscReal c[MAX_PADE+1],s;
976: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
977: PetscScalar *Aa,*Ba;
978: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
979: cublasHandle_t cublasv2handle;
980: cublasStatus_t cberr;
981: cudaError_t cerr;
984: PetscCUBLASGetHandle(&cublasv2handle);
985: MatDenseGetArray(A,&Aa);
986: MatDenseGetArray(B,&Ba);
987: MatGetSize(A,&m,NULL);
988: PetscBLASIntCast(m,&n);
989: ld = n;
990: ld2 = ld*ld;
992: cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
993: cerr = cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
994: cerr = cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
995: cerr = cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
996: cerr = cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
997: cerr = cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld);CHKERRCUDA(cerr);
998: cerr = cudaMalloc((void **)&d_info,sizeof(PetscBLASInt));CHKERRCUDA(cerr);
999: cerr = cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*));CHKERRCUDA(cerr);
1000: cerr = cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*));CHKERRCUDA(cerr);
1002: PetscMalloc(sizeof(PetscScalar*),&ppP);
1003: PetscMalloc(sizeof(PetscScalar*),&ppQ);
1005: cerr = cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1006: cerr = cudaMemcpy(d_Ba,Ba,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1007: d_P = d_Ba;
1009: /* Pade' coefficients */
1010: c[0] = 1.0;
1011: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
1013: /* Scaling */
1014: cberr = cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);CHKERRCUBLAS(cberr);
1015: if (s>0.5) {
1016: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1017: scale = PetscPowRealInt(2.0,-sexp);
1018: cberr = cublasXscal(cublasv2handle,ld2,&scale,d_As,one);CHKERRCUBLAS(cberr);
1019: PetscLogFlops(1.0*n*n);
1020: } else sexp = 0;
1022: /* Horner evaluation */
1023: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);CHKERRCUBLAS(cberr);
1024: PetscLogFlops(2.0*n*n*n);
1025: cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1026: cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1027: set_diagonal(n,d_Q,ld,c[p]);CHKERRQ(cerr);
1028: set_diagonal(n,d_P,ld,c[p-1]);CHKERRQ(cerr);
1030: odd = PETSC_TRUE;
1031: for (k=p-1;k>0;k--) {
1032: if (odd) {
1033: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1034: SWAP(d_Q,d_W,aux);
1035: shift_diagonal(n,d_Q,ld,c[k-1]);CHKERRQ(cerr);
1036: odd = PETSC_FALSE;
1037: } else {
1038: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1039: SWAP(d_P,d_W,aux);
1040: shift_diagonal(n,d_P,ld,c[k-1]);CHKERRQ(cerr);
1041: odd = PETSC_TRUE;
1042: }
1043: PetscLogFlops(2.0*n*n*n);
1044: }
1045: if (odd) {
1046: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1047: SWAP(d_Q,d_W,aux);
1048: cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1050: ppQ[0] = d_Q;
1051: ppP[0] = d_P;
1052: cerr = cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1053: cerr = cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1055: cberr = cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);CHKERRCUBLAS(cberr);
1056: cerr = cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1057: if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
1058: if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
1059: #if defined (CUDA_VERSION) && CUDA_VERSION >= 5050
1060: cberr = cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);CHKERRCUBLAS(cberr);
1061: if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
1062: if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
1063: #else
1064: SETERRQ(communicator,PETSC_ERR_LIB,"cublasXgetrsBatched needs CUDA >= 7");
1065: #endif
1066: cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1067: shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1068: cberr = cublasXscal(cublasv2handle,ld2,&smone,d_P,one);CHKERRCUBLAS(cberr);
1069: } else {
1070: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1071: SWAP(d_P,d_W,aux);
1072: cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1074: ppQ[0] = d_Q;
1075: ppP[0] = d_P;
1076: cerr = cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1077: cerr = cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1079: cberr = cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);CHKERRCUBLAS(cberr);
1080: cerr = cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1081: if (info < 0) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
1082: if (info > 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
1083: #if defined (CUDA_VERSION) && CUDA_VERSION >= 5050
1084: cberr = cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);CHKERRCUBLAS(cberr);
1085: if (info < 0) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
1086: if (info > 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
1087: #else
1088: SETERRQ(communicator,PETSC_ERR_LIB,"cublasXgetrsBatched needs CUDA >= 7");
1089: #endif
1090: cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1091: shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1092: }
1093: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
1095: for (k=1;k<=sexp;k++) {
1096: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1097: cerr = cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1098: }
1099: if (d_P!=d_Ba) {
1100: cerr = cudaMemcpy(Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1101: } else {
1102: cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1103: }
1104: PetscLogFlops(2.0*n*n*n*sexp);
1106: cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1107: cerr = cudaFree(d_Q);CHKERRCUDA(cerr);
1108: cerr = cudaFree(d_W);CHKERRCUDA(cerr);
1109: cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1110: cerr = cudaFree(d_A2);CHKERRCUDA(cerr);
1111: cerr = cudaFree(d_ipiv);CHKERRCUDA(cerr);
1112: cerr = cudaFree(d_info);CHKERRCUDA(cerr);
1113: cerr = cudaFree(d_ppP);CHKERRCUDA(cerr);
1114: cerr = cudaFree(d_ppQ);CHKERRCUDA(cerr);
1116: PetscFree(ppP);
1117: PetscFree(ppQ);
1119: MatDenseRestoreArray(A,&Aa);
1120: MatDenseRestoreArray(B,&Ba);
1121: return(0);
1122: }
1124: #if defined(PETSC_HAVE_MAGMA)
1125: #include <slepcmagma.h>
1127: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1128: {
1130: PetscBLASInt n=0,ld,ld2,*piv,info,one=1,zero=0;
1131: PetscInt m,k,sexp;
1132: PetscBool odd;
1133: const PetscInt p=MAX_PADE;
1134: PetscReal c[MAX_PADE+1],s;
1135: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1136: PetscScalar *Aa,*Ba;
1137: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1138: cublasHandle_t cublasv2handle;
1139: cublasStatus_t cberr;
1140: cudaError_t cerr;
1141: magma_int_t mierr;
1144: PetscCUBLASGetHandle(&cublasv2handle);
1145: magma_init();
1146: MatDenseGetArray(A,&Aa);
1147: MatDenseGetArray(B,&Ba);
1148: MatGetSize(A,&m,NULL);
1149: PetscBLASIntCast(m,&n);
1150: ld = n;
1151: ld2 = ld*ld;
1153: cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1154: cerr = cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1155: cerr = cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1156: cerr = cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1157: cerr = cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1159: PetscMalloc(sizeof(PetscInt)*n,&piv);
1161: cerr = cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1162: cerr = cudaMemcpy(d_Ba,Ba,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1163: d_P = d_Ba;
1165: /* Pade' coefficients */
1166: c[0] = 1.0;
1167: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
1169: /* Scaling */
1170: cberr = cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);CHKERRCUBLAS(cberr);
1171: PetscLogFlops(1.0*n*n);
1173: if (s>0.5) {
1174: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1175: scale = PetscPowRealInt(2.0,-sexp);
1176: cberr = cublasXscal(cublasv2handle,ld2,&scale,d_As,one);CHKERRCUBLAS(cberr);
1177: PetscLogFlops(1.0*n*n);
1178: } else sexp = 0;
1180: /* Horner evaluation */
1181: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);CHKERRCUBLAS(cberr);
1182: PetscLogFlops(2.0*n*n*n);
1183: cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1184: cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1185: set_diagonal(n,d_Q,ld,c[p]);CHKERRQ(cerr);
1186: set_diagonal(n,d_P,ld,c[p-1]);CHKERRQ(cerr);
1188: odd = PETSC_TRUE;
1189: for (k=p-1;k>0;k--) {
1190: if (odd) {
1191: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1192: SWAP(d_Q,d_W,aux);
1193: shift_diagonal(n,d_Q,ld,c[k-1]);CHKERRQ(cerr);
1194: odd = PETSC_FALSE;
1195: } else {
1196: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1197: SWAP(d_P,d_W,aux);
1198: shift_diagonal(n,d_P,ld,c[k-1]);CHKERRQ(cerr);
1199: odd = PETSC_TRUE;
1200: }
1201: PetscLogFlops(2.0*n*n*n);
1202: }
1203: if (odd) {
1204: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1205: SWAP(d_Q,d_W,aux);
1206: cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1207: mmagma_xgesv_gpu(n,n,d_Q,ld,piv,d_P,ld,&info);CHKERRMAGMA(mierr);
1208: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1209: cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1210: shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1211: cberr = cublasXscal(cublasv2handle,ld2,&smone,d_P,one);CHKERRCUBLAS(cberr);
1212: } else {
1213: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1214: SWAP(d_P,d_W,aux);
1215: cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1216: mmagma_xgesv_gpu(n,n,d_Q,ld,piv,d_P,ld,&info);CHKERRMAGMA(mierr);
1217: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1218: cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1219: shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1220: }
1221: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
1223: for (k=1;k<=sexp;k++) {
1224: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1225: cerr = cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1226: }
1227: if (d_P!=d_Ba) {
1228: cerr = cudaMemcpy(Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1229: } else {
1230: cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1231: }
1232: PetscLogFlops(2.0*n*n*n*sexp);
1234: cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1235: cerr = cudaFree(d_Q);CHKERRCUDA(cerr);
1236: cerr = cudaFree(d_W);CHKERRCUDA(cerr);
1237: cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1238: cerr = cudaFree(d_A2);CHKERRCUDA(cerr);
1239: PetscFree(piv);
1241: MatDenseRestoreArray(A,&Aa);
1242: MatDenseRestoreArray(B,&Ba);
1243: magma_finalize();
1244: return(0);
1245: }
1247: /*
1248: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1249: *
1250: * N. J. Higham, "The scaling and squaring method for the matrix exponential
1251: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1252: */
1253: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1254: {
1255: PetscErrorCode ierr;
1256: PetscBLASInt n_=0,n2,*ipiv,info,one=1;
1257: PetscInt n,m,j,s,zero=0;
1258: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1259: PetscScalar *Aa,*Ba,*d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1260: const PetscScalar *c;
1261: const PetscScalar c3[4] = { 120, 60, 12, 1 };
1262: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1263: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1264: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1265: 2162160, 110880, 3960, 90, 1 };
1266: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1267: 1187353796428800, 129060195264000, 10559470521600,
1268: 670442572800, 33522128640, 1323241920,
1269: 40840800, 960960, 16380, 182, 1 };
1270: cublasHandle_t cublasv2handle;
1271: cublasStatus_t cberr;
1272: cudaError_t cerr;
1273: magma_int_t mierr;
1276: PetscCUBLASGetHandle(&cublasv2handle);
1277: magma_init();
1278: MatDenseGetArray(A,&Aa);
1279: MatDenseGetArray(B,&Ba);
1280: MatGetSize(A,&n,NULL);
1281: PetscBLASIntCast(n,&n_);
1282: n2 = n_*n_;
1283: PetscMalloc2(8*n*n,&work,n,&ipiv);
1284: cudaMalloc((void**)&d_work,8*n*n*sizeof(PetscScalar));
1285: cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*n*n);CHKERRCUDA(cerr);
1286: cudaMemcpy(d_Ba,Ba,n2*sizeof(PetscScalar),cudaMemcpyHostToDevice);
1288: /* Matrix powers */
1289: Apowers[0] = work; /* Apowers[0] = A */
1290: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1291: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1292: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1293: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1294: /* Matrix powers on device */
1295: d_Apowers[0] = d_work; /* d_Apowers[0] = A */
1296: d_Apowers[1] = d_Apowers[0] + n*n; /* d_Apowers[1] = A^2 */
1297: d_Apowers[2] = d_Apowers[1] + n*n; /* d_Apowers[2] = A^4 */
1298: d_Apowers[3] = d_Apowers[2] + n*n; /* d_Apowers[3] = A^6 */
1299: d_Apowers[4] = d_Apowers[3] + n*n; /* d_Apowers[4] = A^8 */
1301: cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyHostToDevice);
1302: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_);CHKERRCUBLAS(cberr);
1303: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_);CHKERRCUBLAS(cberr);
1304: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_);CHKERRCUBLAS(cberr);
1305: PetscLogFlops(6.0*n*n*n);
1307: cudaMemcpy(Apowers[0],d_Apowers[0],4*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1308: /* Compute scaling parameter and order of Pade approximant */
1309: expm_params(n,Apowers,&s,&m,Apowers[4]);
1311: if (s) { /* rescale */
1312: for (j=0;j<4;j++) {
1313: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1314: cberr = cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one);CHKERRCUBLAS(cberr);
1315: }
1316: PetscLogFlops(4.0*n*n);
1317: }
1319: /* Evaluate the Pade approximant */
1320: switch (m) {
1321: case 3: c = c3; break;
1322: case 5: c = c5; break;
1323: case 7: c = c7; break;
1324: case 9: c = c9; break;
1325: case 13: c = c13; break;
1326: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1327: }
1328: d_P = d_Ba;
1329: d_Q = d_Apowers[4] + n*n;
1330: d_W = d_Q + n*n;
1331: switch (m) {
1332: case 3:
1333: case 5:
1334: case 7:
1335: case 9:
1336: if (m==9) {cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_);CHKERRCUBLAS(cberr);}
1337: cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1338: cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1339: set_diagonal(n,d_P,n,c[1]);CHKERRQ(cerr);
1340: set_diagonal(n,d_Q,n,c[0]);CHKERRQ(cerr);
1341: for (j=m;j>=3;j-=2) {
1342: cberr = cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one);CHKERRCUBLAS(cberr);
1343: cberr = cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one);CHKERRCUBLAS(cberr);
1344: PetscLogFlops(4.0*n*n);
1345: }
1346: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1347: PetscLogFlops(2.0*n*n*n);
1348: SWAP(d_P,d_W,aux);
1349: break;
1350: case 13:
1351: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1352: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1353: cerr = cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1354: cberr = cublasXscal(cublasv2handle,n2,&c[13],d_P,one);CHKERRCUBLAS(cberr);
1355: cberr = cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one);CHKERRCUBLAS(cberr);
1356: cberr = cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one);CHKERRCUBLAS(cberr);
1357: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1358: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1360: cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1361: set_diagonal(n,d_P,n,c[1]);CHKERRQ(cerr);
1362: cberr = cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one);CHKERRCUBLAS(cberr);
1363: cberr = cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one);CHKERRCUBLAS(cberr);
1364: cberr = cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one);CHKERRCUBLAS(cberr);
1365: cberr = cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one);CHKERRCUBLAS(cberr);
1366: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_);CHKERRCUBLAS(cberr);
1367: PetscLogFlops(7.0*n*n+2.0*n*n*n);
1368: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1369: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1370: cerr = cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1371: cberr = cublasXscal(cublasv2handle,n2,&c[12],d_Q,one);CHKERRCUBLAS(cberr);
1372: cberr = cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one);CHKERRCUBLAS(cberr);
1373: cberr = cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one);CHKERRCUBLAS(cberr);
1374: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1375: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1376: cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1377: set_diagonal(n,d_Q,n,c[0]);CHKERRQ(cerr);
1378: cberr = cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one);CHKERRCUBLAS(cberr);
1379: cberr = cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one);CHKERRCUBLAS(cberr);
1380: cberr = cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one);CHKERRCUBLAS(cberr);
1381: cberr = cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one);CHKERRCUBLAS(cberr);
1382: PetscLogFlops(7.0*n*n);
1383: break;
1384: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1385: }
1386: cberr = cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1388: mmagma_xgesv_gpu(n_,n_,d_Q,n_,ipiv,d_P,n_,&info);CHKERRMAGMA(mierr);
1389: if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1391: cberr = cublasXscal(cublasv2handle,n2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1392: shift_diagonal(n,d_P,n,sone);
1393: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
1395: /* Squaring */
1396: for (j=1;j<=s;j++) {
1397: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1398: SWAP(d_P,d_W,aux);
1399: }
1400: if (d_P!=d_Ba) {
1401: cerr = cudaMemcpy(Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1402: } else {
1403: cerr = cudaMemcpy(Ba,d_Ba,n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1404: }
1405: PetscLogFlops(2.0*n*n*n*s);
1407: PetscFree2(work,ipiv);
1408: MatDenseRestoreArray(A,&Aa);
1409: MatDenseRestoreArray(B,&Ba);
1410: magma_finalize();
1411: return(0);
1412: }
1414: /*
1415: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1416: * and Yuji Nakatsukasa
1417: *
1418: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1419: * Approximation for the Matrix Exponential",
1420: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1421: * https://doi.org/10.1137/15M1027553
1422: */
1423: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1424: {
1425: PetscInt i,j,n_,s,k,m,zero=0,mod;
1426: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,info,*piv,minlen,lwork=0,one=1;
1427: PetscReal nrm,shift=0.0,rone=1.0,rzero=0.0;
1428: #if defined(PETSC_USE_COMPLEX)
1429: PetscReal *rwork=NULL;
1430: #endif
1431: PetscComplex *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1432: PetscScalar *Aa,*Ba,*d_Ba,*d_Ba2,*Maux,*sMaux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1434: PetscBool isreal,*d_isreal,flg;
1435: cublasHandle_t cublasv2handle;
1436: cudaError_t cerr;
1437: cublasStatus_t cberr;
1438: magma_int_t mierr;
1441: PetscCUBLASGetHandle(&cublasv2handle);
1442: magma_init();
1443: MatGetSize(A,&n_,NULL);
1444: PetscBLASIntCast(n_,&n);
1445: MatDenseGetArray(A,&Aa);
1446: MatDenseGetArray(B,&Ba);
1447: PetscBLASIntCast(n*n,&n2);
1449: cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1450: cerr = cudaMemcpy(d_Ba,Ba,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1451: d_Ba2 = d_Ba;
1453: PetscMalloc2(n2,&sMaux,n2,&Maux);
1454: cerr = cudaMalloc((void **)&d_isreal,sizeof(PetscBool));CHKERRCUDA(cerr);
1455: cerr = cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1456: cerr = cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1458: cerr = cudaMemcpy(d_sMaux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1459: d_Maux2 = d_Maux;
1460: PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
1461: if (!flg) {
1462: PetscMalloc2(n,&wr,n,&wi);
1463: PetscArraycpy(sMaux,Aa,n2);
1464: /* estimate rightmost eigenvalue and shift A with it */
1465: #if !defined(PETSC_USE_COMPLEX)
1466: mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,sMaux,n,wr,wi,NULL,n,NULL,n,&work1,query,&info);CHKERRMAGMA(mierr);
1467: SlepcCheckLapackInfo("geev",info);
1468: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1469: PetscMalloc1(lwork,&work);
1470: mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,sMaux,n,wr,wi,NULL,n,NULL,n,work,lwork,&info);CHKERRMAGMA(mierr);
1471: PetscFree(work);
1472: #else
1473: PetscArraycpy(Maux,Aa,n2);
1474: mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork,&info);CHKERRMAGMA(mierr);
1475: SlepcCheckLapackInfo("geev",info);
1476: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1477: PetscMalloc2(2*n,&rwork,lwork,&work);
1478: mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork,&info);CHKERRMAGMA(mierr);
1479: PetscFree2(rwork,work);
1480: #endif
1481: SlepcCheckLapackInfo("geev",info);
1482: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
1484: shift = PetscRealPart(wr[0]);
1485: for (i=1;i<n;i++) {
1486: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1487: }
1488: PetscFree2(wr,wi);
1489: }
1490: /* shift so that largest real part is (about) 0 */
1491: cerr = cudaMemcpy(d_sMaux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1492: if (shift) {
1493: shift_diagonal(n,d_sMaux,n,-shift);
1494: PetscLogFlops(1.0*n);
1495: }
1496: #if defined(PETSC_USE_COMPLEX)
1497: cerr = cudaMemcpy(d_Maux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1498: if (shift) {
1499: shift_diagonal(n,d_Maux,n,-shift);
1500: PetscLogFlops(1.0*n);
1501: }
1502: #endif
1504: /* estimate norm(A) and select the scaling factor */
1505: cberr = cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm);CHKERRCUBLAS(cberr);
1506: PetscLogFlops(2.0*n*n);
1507: sexpm_params(nrm,&s,&k,&m);
1508: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1509: if (shift) expshift = PetscExpReal(shift);
1510: shift_Cdiagonal(n,d_Maux,n,rone,rzero);
1511: if (shift) {
1512: cberr = cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one);CHKERRCUBLAS(cberr);
1513: PetscLogFlops(1.0*(n+n2));
1514: } else {
1515: PetscLogFlops(1.0*n);
1516: }
1517: cerr = cudaMemcpy(Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1518: cerr = cudaFree(d_sMaux);CHKERRCUDA(cerr);
1519: cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1520: MatDenseRestoreArray(A,&Aa);
1521: MatDenseRestoreArray(B,&Ba);
1522: return(0); /* quick return */
1523: }
1525: cerr = cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1526: cerr = cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1527: cerr = cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1528: d_expmA2 = d_expmA; d_RR2 = d_RR;
1529: PetscMalloc1(n,&piv);
1530: /* scale matrix */
1531: #if !defined(PETSC_USE_COMPLEX)
1532: copy_array2D_S2C(n,n,d_As,n,d_sMaux,n);
1533: #else
1534: cerr = cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1535: #endif
1536: scale = 1.0/PetscPowRealInt(2.0,s);
1537: cberr = cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one);CHKERRCUBLAS(cberr);
1538: SlepcLogFlopsComplex(1.0*n2);
1540: /* evaluate Pade approximant (partial fraction or product form) */
1541: if (fn->method==8 || !m) { /* partial fraction */
1542: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
1543: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1544: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1545: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
1546: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
1547: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
1549: cerr = cudaMemset(d_expmA,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1550: #if !defined(PETSC_USE_COMPLEX)
1551: isreal = PETSC_TRUE;
1552: #else
1553: getisreal_array2D(n,n,d_Maux,n,d_isreal);
1554: cerr = cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1555: #endif
1556: if (isreal) {
1557: rsizediv2 = irsize/2;
1558: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1559: cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1560: cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1561: shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i]));
1562: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i]));
1563: mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1564: SlepcCheckLapackInfo("gesv",info);
1565: add_array2D_Conj(n,n,d_RR,n);
1566: cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1567: /* shift(n) + gesv + axpy(n2) */
1568: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
1569: }
1571: mod = ipsize % 2;
1572: if (mod) {
1573: cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1574: cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1575: shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1]));
1576: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1]));
1577: mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1578: SlepcCheckLapackInfo("gesv",info);
1579: cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1580: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1581: }
1582: } else { /* complex */
1583: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1584: cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1585: cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1586: shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i]));
1587: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i]));
1588: mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1589: SlepcCheckLapackInfo("gesv",info);
1590: cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1591: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1592: }
1593: }
1594: for (i=0;i<iremainsize;i++) {
1595: if (!i) {
1596: cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1597: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1]));
1598: } else {
1599: cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1600: for (j=1;j<i;j++) {
1601: cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1602: SWAP(d_RR,d_Maux,aux);
1603: SlepcLogFlopsComplex(2.0*n*n*n);
1604: }
1605: cberr = cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one);CHKERRCUBLAS(cberr);
1606: SlepcLogFlopsComplex(1.0*n2);
1607: }
1608: cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1609: SlepcLogFlopsComplex(1.0*n2);
1610: }
1611: PetscFree3(r,p,remainterm);
1612: } else { /* product form, default */
1613: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
1614: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1615: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1616: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
1617: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
1619: cerr = cudaMemset(d_expmA,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1620: set_Cdiagonal(n,d_expmA,n,rone,rzero); /* initialize */
1621: minlen = PetscMin(irsize,ipsize);
1622: for (i=0;i<minlen;i++) {
1623: cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1624: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1625: cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1626: SWAP(d_expmA,d_Maux,aux);
1627: cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1628: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1629: mmagma_Cgesv_gpu(n,n,d_RR,n,piv,d_expmA,n,&info);CHKERRMAGMA(mierr);
1630: SlepcCheckLapackInfo("gesv",info);
1631: /* shift(n) + gemm + shift(n) + gesv */
1632: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1633: }
1634: /* extra enumerator */
1635: for (i=minlen;i<irsize;i++) {
1636: cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1637: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1638: cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1639: SWAP(d_expmA,d_Maux,aux);
1640: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
1641: }
1642: /* extra denominator */
1643: for (i=minlen;i<ipsize;i++) {
1644: cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1645: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1646: mmagma_Cgesv_gpu(n,n,d_RR,n,piv,d_expmA,n,&info);CHKERRMAGMA(mierr);
1647: SlepcCheckLapackInfo("gesv",info);
1648: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1649: }
1650: cberr = cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one);CHKERRCUBLAS(cberr);
1651: SlepcLogFlopsComplex(1.0*n2);
1652: PetscFree2(rootp,rootq);
1653: }
1655: #if !defined(PETSC_USE_COMPLEX)
1656: copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n);
1657: #else
1658: cerr = cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1659: #endif
1661: /* perform repeated squaring */
1662: for (i=0;i<s;i++) { /* final squaring */
1663: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n);CHKERRCUBLAS(cberr);
1664: SWAP(d_Ba2,d_sMaux,saux);
1665: PetscLogFlops(2.0*n*n*n);
1666: }
1667: if (d_Ba2!=d_Ba) {
1668: cerr = cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1669: d_sMaux = d_Ba2;
1670: }
1671: if (shift) {
1672: expshift = PetscExpReal(shift);
1673: cberr = cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one);CHKERRCUBLAS(cberr);
1674: PetscLogFlops(1.0*n2);
1675: }
1677: cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1679: /* restore pointers */
1680: d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1681: cerr = cudaFree(d_isreal);CHKERRCUDA(cerr);
1682: cerr = cudaFree(d_Maux);CHKERRCUDA(cerr);
1683: cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1684: cerr = cudaFree(d_RR);CHKERRCUDA(cerr);
1685: cerr = cudaFree(d_expmA);CHKERRCUDA(cerr);
1686: cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1687: PetscFree(piv);
1688: PetscFree2(sMaux,Maux);
1689: MatDenseRestoreArray(A,&Aa);
1690: MatDenseRestoreArray(B,&Ba);
1691: magma_finalize();
1692: return(0);
1693: }
1694: #endif /* PETSC_HAVE_MAGMA */
1695: #endif /* PETSC_HAVE_CUDA */
1697: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1698: {
1700: PetscBool isascii;
1701: char str[50];
1702: const char *methodname[] = {
1703: "scaling & squaring, [m/m] Pade approximant (Higham)",
1704: "scaling & squaring, [6/6] Pade approximant",
1705: "scaling & squaring, subdiagonal Pade approximant (product form)",
1706: "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1707: #if defined(PETSC_HAVE_CUDA)
1708: ,"scaling & squaring, [6/6] Pade approximant CUDA"
1709: #if defined(PETSC_HAVE_MAGMA)
1710: ,"scaling & squaring, [m/m] Pade approximant (Higham) CUDA/MAGMA",
1711: "scaling & squaring, [6/6] Pade approximant CUDA/MAGMA",
1712: "scaling & squaring, subdiagonal Pade approximant (product form) CUDA/MAGMA",
1713: "scaling & squaring, subdiagonal Pade approximant (partial fraction) CUDA/MAGMA",
1714: #endif
1715: #endif
1716: };
1717: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
1720: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1721: if (isascii) {
1722: if (fn->beta==(PetscScalar)1.0) {
1723: if (fn->alpha==(PetscScalar)1.0) {
1724: PetscViewerASCIIPrintf(viewer," Exponential: exp(x)\n");
1725: } else {
1726: SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1727: PetscViewerASCIIPrintf(viewer," Exponential: exp(%s*x)\n",str);
1728: }
1729: } else {
1730: SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
1731: if (fn->alpha==(PetscScalar)1.0) {
1732: PetscViewerASCIIPrintf(viewer," Exponential: %s*exp(x)\n",str);
1733: } else {
1734: PetscViewerASCIIPrintf(viewer," Exponential: %s",str);
1735: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1736: SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1737: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1738: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1739: }
1740: }
1741: if (fn->method<nmeth) {
1742: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
1743: }
1744: }
1745: return(0);
1746: }
1748: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1749: {
1751: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1752: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1753: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1754: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1755: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1756: fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1757: #if defined(PETSC_HAVE_CUDA)
1758: fn->ops->evaluatefunctionmat[4] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1759: #if defined(PETSC_HAVE_MAGMA)
1760: fn->ops->evaluatefunctionmat[5] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1761: fn->ops->evaluatefunctionmat[6] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1762: fn->ops->evaluatefunctionmat[7] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1763: fn->ops->evaluatefunctionmat[8] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1764: #endif
1765: #endif
1766: fn->ops->view = FNView_Exp;
1767: return(0);
1768: }