/*
    main.cpp   entry point for jkl-ecm

    Copyright (C) 2015-2017  Oisin Robinson

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/

// Created on 18 February 2015, 21:07


#ifndef __GMP_PLUSPLUS__
#include <gmpxx.h>
#endif
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include <vector>
#include <ctime>
#include <stdint.h>
#include "jklecmdefs.h"
#include <stack>
#include <cmath>

using namespace std;

// Forward declarations of the routines used by this file.
// GetlcmScalar, jklecm and stage2 are defined further down in this file;
// the remaining routines are not defined in the part of the file shown here
// (presumably they live elsewhere in the file or in other translation units).
void GetlcmScalar(uint64_t, mpz_t);
void mpz_set_ull(mpz_t, unsigned long long);
void PolyEval(mpz_t*, mpz_t*, long*, long*, mpz_t*, mpz_t*, long, long, mpz_t, mpz_t);
long NumProductTreeNodes(long);
long GetPolyProductInfo(long, long, long, long*, long*);
long getPolyDegree(mpz_t*, long);
void PolyIntProductTree(mpz_t*, mpz_t*, long*, long, mpz_t, long);
void PolyIntProductTree2(mpz_t*, long*, mpz_t*, long, mpz_t, long, long);
void PolyReciprocal(mpz_t*, uint64_t, mpz_t, mpz_t*);
void PolyIntProduct(mpz_t*, mpz_t*, uint64_t, uint64_t, mpz_t, mpz_t*);
void TreePolyIntProduct(mpz_t*, mpz_t*, long, uint64_t, uint64_t, mpz_t, mpz_t*);
void jklecm(mpz_t, uint64_t, mpz_t, mpz_t, long, long, long, bool, long, long, bool, long, long, bool);
bool stage2(mpz_t, long, long, int, int, mpz_t, int, bool, mpz_t, mpz_t, epoint*, bool);
void ScalarMultiplyHessian(mpz_t, mpz_t, epoint*, epoint*, mpz_t, mpz_t);
void HEnAdd(mpz_t, mpz_t, epoint*, epoint*, epoint*, mpz_t);
void getstr(mpz_t);
void PolyReduceModF(mpz_t*, long, mpz_t*, long, long, mpz_t, mpz_t*, mpz_t*);

// Global variables
//
// Curve tables, allocated, filled and released by main() and read by jklecm().
// Each is an array of 5 rows indexed as table[field][curve]; jklecm() reads the
// fields, in order, as decimal strings for d, a, X0, Y0 and Z0.
// curvedata1 is loaded from curvedata1.txt and is used when Hessian is false
// (Edwards curves); curvedata3 is loaded from curvedata3.txt and is used when
// Hessian is true.
string** curvedata1;
string** curvedata3;

/*
 * 
 */
int main(int argc, char** argv) {

	////Turn on debugging for memory leaks. This is automatically turned off when the build is Release.
	//_CrtSetReportMode(_CRT_WARN, _CRTDBG_MODE_FILE);
	//_CrtSetReportFile(_CRT_WARN, _CRTDBG_FILE_STDOUT);
	//_CrtSetReportMode(_CRT_ERROR, _CRTDBG_MODE_FILE);
	//_CrtSetReportFile(_CRT_ERROR, _CRTDBG_FILE_STDOUT);
	//_CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_FILE);
	//_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDOUT);

    mpz_t n, B2;
    mpz_init(n); mpz_init(B2);
    long B1;
    
    //char *workingDir =_getcwd(NULL, 0);
    //cout << workingDir << "\n";
    
    //for (int i = 0; i < argc; i++) {
    //    cout << argv[i] << "\n";
    //}
    
    if (argc <= 1) {
		cout << "\nUsage:\n";
		cout << "  jklecm B1 B2 k type startc numc v c < file\n";
		cout << "\n";
		cout << "Parameters:\n";
		cout << "  B1        stage 1 bound (default 1000)\n";
		cout << "  B2        stage 2 bound (default B1*100)\n";       
		cout << "  k         number of stage 2 blocks (default 3)\n";
		cout << "  type      use Edwards curves (E) or Hessian curves (H)\n";
		cout << "  startc    curve number to start with\n";
		cout << "  numc      maximum number of curves to try\n";
        cout << "  v         verbose output unless v omitted \n";
		cout << "  c         continue factoring if cofactor composite\n";
		cout << "  file      text file containing number to factor.\n";
        cout << "            Alternatively, omit <, press enter, type an integer, and enter again.\n";
        cout << "\n";
        cout << "Example:    jklecm 100000 4000000000 3 H 1 10 v c < input.txt\n";
        cout << "\n";

        return 1;
    }

	if (*argv) ++argv;

	B1 = 1000;
	if (*argv) B1 = atoi(*argv++);

	mpz_set_ui(B2, B1*100);
	if (*argv) mpz_set_str(B2, *argv++, 10);

	long k = 3;
	if (*argv) k = atoi(*argv++);

	char EH = 'E';
	if (*argv) EH = **argv++;
    bool Hessian = (EH == 'E' ? false:true);

	long startc = 1;
	if (*argv) startc = atoi(*argv++);
	if (EH == 'E' && startc > 700) startc = 700;
	if (EH == 'H' && startc > 5000) startc = 5000;

	long numc = (EH == 'E' ? 700:5000);
	if (*argv) numc = atoi(*argv++);
	if (EH == 'E' && numc > 701 - startc) numc = 701 - startc;
	if (EH == 'H' && numc > 5001 - startc) numc = 5001 - startc;

	bool verbose = false;
	char v = 'v';
	if (*argv) {
		v = **argv++;
		verbose = (v == 'v' ? true:false);
	}
    
    bool Continue = false;
	char c2 = 'c';
	if (*argv) {
		c2 = **argv++;
		Continue = (c2 == 'c' ? true:false);
	}
    
    // load curve data from curvedata1.txt
    int numE = 0;
    int c;
    string line;
    ifstream file1("curvedata1.txt");
    getline(file1, line);
    numE = atoi(line.c_str());
    curvedata1 = new string*[5];
    for (int i = 0; i < 5; i++) curvedata1[i] = new string[numE];
    for (c = 0; c < numE; c++) {
        getline(file1, line);
        vector<string> s = split(line, ',');
        for (int i = 0; i < 5; i++) 
            curvedata1[i][c] = s[i];
    }
    file1.close();

    // load curve data from curvedata3.txt
    int numH = 0;
    line;
    ifstream file2("curvedata3.txt");
    getline(file2, line);
    numH = atoi(line.c_str());
    curvedata3 = new string*[5];
    for (int i = 0; i < 5; i++) curvedata3[i] = new string[numH];
    for (c = 0; c < numH; c++) {
        getline(file2, line);
        vector<string> s = split(line, ',');
        for (int i = 0; i < 5; i++) 
            curvedata3[i][c] = s[i];
    }
    file2.close();

	string inputline;

	//while (getline(cin, inputline) && !inputline.empty()) {
    //while (cin >> inputline) {
    cin >> inputline;
        if (inputline[0] != '#' && inputline[0] != 13) {
            mpz_set_str(n, inputline.c_str(), 10);	

            cout << "Number to factor: \n" << mpz_class(n).get_str(10) << " (" << mpz_class(n).get_str(10).length() << " digits)\n";

            // print program (effective) parameters
            long smallprimes[10] = { 2,3,5,7,11,13,17,19,23,29 };

            mpz_t target_range; mpz_init_set(target_range, B2);
            // for a given d1, B2 = i_max * d1 + j_max
            // 0 <= i <= k * deg(G)
            // 1 <= j <= deg(F)
            // deg(F) = phi(d1)/2
            // deg(G) = deg(F) - 1
            // so B2 = k * (phi(d1)/2 - 1) * d1 + phi(d1)/2
            mpz_t range; mpz_init_set_ui(range, k*(8/2-1)*30+8/2);
            mpz_t test_range; mpz_init(test_range);
            mpz_t diff1, diff2; mpz_init(diff1); mpz_init(diff2);
            long d1 = 30;  // we always have 30 | d1 as explained in stage 2
            long phi = 8;
            for (int bits = 0; bits < 1<<7; bits++) {
                long test_d1 = 30;
                long test_phi = 8;
                for (int b = 0; b < 7; b++) {
                    if ((bits & (1<<b)) > 0) {  // there are 7 "free" primes, we try every combination
                        test_d1 *= smallprimes[3 + b];
                        test_phi *= smallprimes[3 + b] - 1;
                    }
                }
                mpz_set_ui(test_range, test_phi/2-1);
                mpz_mul_ui(test_range, test_range, k);
                mpz_mul_ui(test_range, test_range, test_d1);
                mpz_add_ui(test_range, test_range, test_phi/2);
                mpz_sub(diff1, test_range, target_range);
                mpz_abs(diff1, diff1);
                mpz_sub(diff2, range, target_range);
                mpz_abs(diff2, diff2);
                // we are looking for the value of k * (phi(d1)/2 - 1) * d1 + phi(d1)/2
                // which exceeds requested B2, but is closest to it.
                if (mpz_cmp(diff1, diff2) < 0 && mpz_cmp(test_range, target_range) >= 0) {
                    mpz_set(range, test_range);
                    d1 = test_d1;
                    phi = test_phi;
                }
            }
            // now we have the best values in range, d1 and phi
            long numj = phi/2;  // number of roots of stage 2 polynomial
            mpz_t B2dash; mpz_init_set(B2dash, range);

            jklecm(n, B1, B2, B2dash, k, d1, numj, Hessian, numE, numH, verbose, startc-1, numc, Continue);

            mpz_clear(B2dash);
            mpz_clear(diff2);
            mpz_clear(diff1);
            mpz_clear(test_range);
            mpz_clear(range);
            mpz_clear(target_range);                
        }

        //inputline.clear();
	//}
    
    for (int i = 0; i < 5; i++) {
        delete[] curvedata1[i];
		delete[] curvedata3[i];
	}
	delete[] curvedata1;
	delete[] curvedata3;
	
	mpz_clear(B2); mpz_clear(n);

    return 0;
}


/*
 */
void GetlcmScalar(uint64_t B, mpz_t S) {

    // Sieve of Eratosthenes
    uint64_t i;
    char* sieve = new char[B+1]();   // zero-initialized
    for (i = 2; i <= B; i++) {
        if(!sieve[i]) for (uint64_t j = i * 2; j <= B; j += i) sieve[j] = 1;
    }
    // count number of primes (correspond to 0) in sieve 
    int n = 0;
    for (i = 2; i <= B; i++) {
		if (!sieve[i]) n++;
	}
    
    mpz_t* tree = new mpz_t[n];
    
    // Construct product tree
    n = 0;
    mpz_t ie; mpz_init(ie); mpz_t iei; mpz_init(iei);
    for (i = 2; i <= B; i++) {
        if (!sieve[i]) {
            // i is prime. Set tree[n] = i^e, such that i^e < B < i^(e+1)
            mpz_set_ui(ie, i);
            mpz_mul_ui(iei, ie, i);
            while (mpz_cmp_ui(iei, B) < 0) { mpz_set(ie, iei); mpz_mul_ui(iei, ie, i); }
            mpz_init(tree[n]);
            mpz_set(tree[n], ie);
            n++;
        }
    }
    delete[] sieve; mpz_clear(ie); mpz_clear(iei);
    
    // Coalesce product tree
    uint64_t treepos = n - 1;
    while (treepos > 0) {
        for (i = 0; i <= treepos; i += 2) {
            if(i < treepos)
				mpz_lcm(tree[i/2], tree[i],tree[i + 1]);
            else
				mpz_set(tree[i/2], tree[i]);
        }
        for (i = (treepos >> 1); i < treepos - 1; i++) mpz_set_ui(tree[i + 1], 1);
        treepos = treepos >> 1;
    }
    // tree[0] is the lcm of all primes with powers bounded by B
    mpz_set(S, tree[0]);
        
    for (i = 0; i < n; i++) mpz_clear(tree[i]);
    delete[] tree;
}

/* EEnAdd
 *
 * Adds two points on a twisted Edwards curve in projective coordinates,
 * R = P + Q, working modulo n.
 *
 *   d, a   curve parameters
 *   P, Q   points to add
 *   R      receives the sum; it is only written after all reads of P and Q
 *   n      modulus
 */
void EEnAdd(mpz_t d, mpz_t a, epoint* P, epoint* Q, epoint* R, mpz_t n) {
    mpz_t scratch, zProd, zProdSq, xProd, yProd, dxy, diffBE, sumBE;
    mpz_t sumP, sumQ, cross, scaled, outX, outY, outZ;

    mpz_init(scratch); mpz_init(zProd); mpz_init(zProdSq); mpz_init(xProd);
    mpz_init(yProd); mpz_init(dxy); mpz_init(diffBE); mpz_init(sumBE);
    mpz_init(sumP); mpz_init(sumQ); mpz_init(cross); mpz_init(scaled);
    mpz_init(outX); mpz_init(outY); mpz_init(outZ);

    // A = Z1*Z2, B = A^2, C = X1*X2, D = Y1*Y2 (all reduced mod n)
    mpz_mul(scratch, P->z, Q->z);
    mpz_mod(zProd, scratch, n);
    mpz_mul(scratch, zProd, zProd);
    mpz_mod(zProdSq, scratch, n);
    mpz_mul(scratch, P->x, Q->x);
    mpz_mod(xProd, scratch, n);
    mpz_mul(scratch, P->y, Q->y);
    mpz_mod(yProd, scratch, n);

    // E = d*C*D mod n, F = B - E, G = B + E
    mpz_mul(scratch, d, xProd);
    mpz_mul(scratch, scratch, yProd);
    mpz_mod(dxy, scratch, n);
    mpz_sub(diffBE, zProdSq, dxy);
    mpz_add(sumBE, zProdSq, dxy);

    // cross = ((X1+Y1)*(X2+Y2) mod n) - C - D
    mpz_add(sumP, P->x, P->y);
    mpz_add(sumQ, Q->x, Q->y);
    mpz_mul(scratch, sumP, sumQ);
    mpz_mod(cross, scratch, n);
    mpz_sub(cross, cross, xProd);
    mpz_sub(cross, cross, yProd);

    // X3 = (A*F mod n) * cross
    mpz_mul(scratch, zProd, diffBE);
    mpz_mod(scaled, scratch, n);
    mpz_mul(outX, scaled, cross);

    // Y3 = (A*G mod n) * (D - a*C)
    mpz_mul(scratch, zProd, sumBE);
    mpz_mod(scaled, scratch, n);
    mpz_mul(scratch, a, xProd);
    mpz_sub(scratch, yProd, scratch);
    mpz_mul(outY, scaled, scratch);

    // Z3 = F*G
    mpz_mul(outZ, diffBE, sumBE);

    mpz_mod(R->x, outX, n);
    mpz_mod(R->y, outY, n);
    mpz_mod(R->z, outZ, n);

    mpz_clear(scratch); mpz_clear(zProd); mpz_clear(zProdSq); mpz_clear(xProd);
    mpz_clear(yProd); mpz_clear(dxy); mpz_clear(diffBE); mpz_clear(sumBE);
    mpz_clear(sumP); mpz_clear(sumQ); mpz_clear(cross); mpz_clear(scaled);
    mpz_clear(outX); mpz_clear(outY); mpz_clear(outZ);
}

/* ScalarMultiplyEdwards
 *
 * Multiplies a point on a twisted Edwards curve by a scalar using the
 * left-to-right double & add method, working modulo n.
 *
 * d	d parameter of twisted Edwards curve
 * a	a parameter of twisted Edwards curve
 * P	point on curve to multiply, in projective coordinates
 * SP	receives the multiple of P; it starts out as P reduced mod n
 * n	we work modulo n
 * S	scalar multiple; its top bit is taken as set, so the loop below
 *	only processes the bits underneath it
*/
void ScalarMultiplyEdwards(mpz_t d, mpz_t a, epoint* P, epoint* SP, mpz_t n, mpz_t S) {

    mpz_t wide, xySum, sq, C, D, E, F, G, H, J, A, lhs, rhs, newX, newY, newZ;

    mpz_init(wide); mpz_init(xySum); mpz_init(sq); mpz_init(C); mpz_init(D);
    mpz_init(E); mpz_init(F); mpz_init(G); mpz_init(H); mpz_init(J); mpz_init(A);
    mpz_init(lhs); mpz_init(rhs); mpz_init(newX); mpz_init(newY); mpz_init(newZ);

    // the accumulator starts as P itself (covers the leading 1 bit of S)
    mpz_mod(SP->x, P->x, n);
    mpz_mod(SP->y, P->y, n);
    mpz_mod(SP->z, P->z, n);

    // Compute exponentiation chain
    expchain* chain = new expchain();
    mpz_set(chain->S, S);
    chain->L = (unsigned long long)mpz_sizeinbase(chain->S, 2); // exact for base = power of 2

    // Walk the bits below the top one, from bit L-2 down to bit 0
    for (unsigned long long bit = chain->L - 1; bit-- > 0; ) {
        // double: [2](x:y:z) = ((B-C-D)*J : F*(E-D) : F*J)
        mpz_add(xySum, SP->x, SP->y);
        mpz_mul(wide, xySum, xySum);
        mpz_mod(sq, wide, n);                 // B = (x+y)^2
        mpz_mul(wide, SP->x, SP->x);
        mpz_mod(C, wide, n);                  // C = x^2
        mpz_mul(wide, SP->y, SP->y);
        mpz_mod(D, wide, n);                  // D = y^2
        mpz_mul(E, a, C);                     // E = a*C
        mpz_add(F, E, D);                     // F = E + D
        mpz_mul(wide, SP->z, SP->z);
        mpz_mod(H, wide, n);                  // H = z^2
        mpz_mul_2exp(H, H, 1);                // 2*H
        mpz_sub(J, F, H);                     // J = F - 2*H
        mpz_sub(lhs, sq, C);
        mpz_sub(lhs, lhs, D);                 // B - C - D
        mpz_mul(newX, lhs, J);
        mpz_sub(rhs, E, D);
        mpz_mul(newY, F, rhs);
        mpz_mul(newZ, F, J);
        mpz_mod(SP->x, newX, n);
        mpz_mod(SP->y, newY, n);
        mpz_mod(SP->z, newZ, n);

        if (mpz_tstbit(chain->S, bit) != 1) continue;

        // add P to the accumulator
        mpz_mul(A, P->z, SP->z);              // A = Z1*Z2
        mpz_add(xySum, SP->x, SP->y);
        mpz_mul(wide, A, A);
        mpz_mod(sq, wide, n);                 // B = A^2
        mpz_mul(C, P->x, SP->x);              // C = X1*X2
        mpz_mul(D, P->y, SP->y);              // D = Y1*Y2
        mpz_mul(wide, d, C);
        mpz_mul(wide, wide, D);
        mpz_mod(E, wide, n);                  // E = d*C*D
        mpz_sub(F, sq, E);                    // F = B - E
        mpz_add(G, sq, E);                    // G = B + E
        mpz_add(lhs, P->x, P->y);
        mpz_mul(wide, lhs, xySum);
        mpz_mod(lhs, wide, n);
        mpz_sub(lhs, lhs, C);
        mpz_sub(lhs, lhs, D);                 // (X1+Y1)*(X2+Y2) - C - D
        mpz_mul(wide, A, F);
        mpz_mod(rhs, wide, n);                // A*F
        mpz_mul(newX, rhs, lhs);
        mpz_mul(wide, A, G);
        mpz_mod(rhs, wide, n);                // A*G
        mpz_mul(wide, a, C);
        mpz_sub(wide, D, wide);               // D - a*C
        mpz_mul(newY, rhs, wide);
        mpz_mul(newZ, F, G);
        mpz_mod(SP->x, newX, n);
        mpz_mod(SP->y, newY, n);
        mpz_mod(SP->z, newZ, n);
    }
    delete chain;

    mpz_clear(wide); mpz_clear(xySum); mpz_clear(sq); mpz_clear(C); mpz_clear(D);
    mpz_clear(E); mpz_clear(F); mpz_clear(G); mpz_clear(H); mpz_clear(J); mpz_clear(A);
    mpz_clear(lhs); mpz_clear(rhs); mpz_clear(newX); mpz_clear(newY); mpz_clear(newZ);
}


/* getpow
 *
 * Finds the smallest exponent l >= 1 with Rmin < 2^l.
 *
 *   Rmin   lower bound to exceed
 *   R      receives 2^l
 *
 * Returns l.
 */
mp_bitcnt_t getpow(mpz_t Rmin, mpz_t R) {

    mp_bitcnt_t exponent = 1;

    // start at 2^1 and keep doubling until R is strictly larger than Rmin
    for (mpz_set_ui(R, 2); mpz_cmp(R, Rmin) <= 0; ++exponent)
        mpz_mul_2exp(R, R, 1);

    return exponent;
}


/*
 * function to run ecm
 * 
 *      n       
 *      B1      
 *      B2      
 *      d1      
 *      startc      
 *      numc    
 */
void jklecm(mpz_t n, uint64_t B1, mpz_t B2, mpz_t B2dash, long k, long d1, long numj, bool Hessian, 
    long numE, long numH, bool verbose, long startc, long numc, bool Continue) {
    
    clock_t start_time;
    clock_t end_time;
    int ms1 = 0, ms2 = 0;
    
    // Calculate scalar multiple from B1 using tree algorithm to speed lcm calculations
	if (verbose)
		cout << "Calculating lcm...\n";
	std::cout << flush;
    start_time = clock();
    mpz_t S; mpz_init(S); GetlcmScalar(B1, S);
    end_time = clock();
	if (verbose)
		cout << "Computation of scalar multiple took " << (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC << "ms\n";
	std::cout << flush;
    
    //cout << mpz_get_str(NULL, 10, S) << "\n";
    
    cout << "Factoring...\n";
    int stage1ms = 0;
    mpz_t d, a, X0, Y0, Z0, p;
    mpz_init(d); mpz_init(a); mpz_init(X0); mpz_init(Y0); mpz_init(Z0), mpz_init(p);    
    bool factored = false;
    long c = startc;
    while (!factored && c < (Hessian?numH:numE) && ((c-startc) < numc)) {
        // stage 1
		if (verbose) {
			cout << "\nB1:" << B1 << " B2:" << mpz_class(B2).get_str(10) << " B2':" << mpz_class(B2dash).get_str(10) << " d1:" << d1 << " k:" << k << "\n";        
		}
		if (Hessian) {
            mpz_set_str(d, curvedata3[0][c].c_str(), 10);
            mpz_set_str(a, curvedata3[1][c].c_str(), 10);
            mpz_set_str(X0, curvedata3[2][c].c_str(), 10);
            mpz_set_str(Y0, curvedata3[3][c].c_str(), 10);
            mpz_set_str(Z0, curvedata3[4][c].c_str(), 10);
        }
        else {
            mpz_set_str(d, curvedata1[0][c].c_str(), 10);
            mpz_set_str(a, curvedata1[1][c].c_str(), 10);
            mpz_set_str(X0, curvedata1[2][c].c_str(), 10);
            mpz_set_str(Y0, curvedata1[3][c].c_str(), 10);
            mpz_set_str(Z0, curvedata1[4][c].c_str(), 10);
        }
		cout << "\nRunning curve " << c + 1 - startc << " of " << numc << " (curve id is #" << c + 1 << ")\n"; 
		if (verbose) {       
			cout << (Hessian?"\nHessian":"\nEdwards") << " curve parameters: a: " << mpz_class(a).get_str(10) << " d: " << mpz_get_str(NULL, 10, d);
			cout << "\nwhere " << (Hessian?"a*X^3 + Y^3 + Z^3 = d*X*Y*Z\n":"a*X^2*Z^2 + Y^2*Z^2 = Z^4 + d*X^2*Y^2\n");
		}
		std::cout << flush;
        start_time = clock();
        epoint* P = new epoint(X0, Y0, Z0);
        epoint* SP = new epoint();
        if (Hessian) ScalarMultiplyHessian(d, a, P, SP, n, S);
        else ScalarMultiplyEdwards(d, a, P, SP, n, S);
        // We hope that #E(Fp) is B smooth and #E(Fq) is not, where n=pq
        mpz_gcd(p, n, SP->x);
		end_time = clock();
        stage1ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
        if (true) { //stage1ms > 10000) {
            //cout << c + 1 << " (1:" << stage1ms << "ms";
            cout << "Stage 1 took " << stage1ms << "ms\n";
			std::cout << flush;
        }
        if (mpz_cmp_ui(p, 1) > 0 && mpz_cmp(n, p) > 0) {
            cout << "Factored in stage 1 using curve #" << c + 1 << "\n";
			int pr = mpz_probab_prime_p(p, 25);
            cout << mpz_class(p).get_str(10) << " (" << mpz_class(p).get_str(10).length() << " digits - " << (pr==2?"prime":(pr==1?"probable prime":"composite")) << ")\n";
            std::cout << flush;
            
            mpz_t cofactor; mpz_init(cofactor);
            mpz_fdiv_q(cofactor, n, p);
            pr = mpz_probab_prime_p(cofactor, 25); // reps = 25 gives error prob < 2^(-50)
            
            if (Continue && pr == 0) {
                cout << "Cofactor is composite, continuing...\n";
                factored = false;
                mpz_set(n, cofactor);
                cout << "Number to factor: \n" << mpz_class(n).get_str(10) << " (" << mpz_class(n).get_str(10).length() << " digits)\n";
                std::cout << flush;
            }
            else {
                factored = true;
            }
            mpz_clear(cofactor);
        }
        else {
            if (mpz_cmp_ui(B2, B1) > 0) {
                int e = 1;
                factored = stage2(n, d1, numj, k, e, p, c, Hessian, d, a, SP, verbose);
                
                if (factored) {
					int pr = mpz_probab_prime_p(p, 25);
                    cout << mpz_class(p).get_str(10) << " (" << mpz_class(p).get_str(10).length() << " digits - " << (pr==2?"prime":(pr==1?"probable prime":"composite")) << ")\n";
                    std::cout << flush;
                    
                    mpz_t cofactor; mpz_init(cofactor);
                    mpz_fdiv_q(cofactor, n, p);
                    pr = mpz_probab_prime_p(cofactor, 25); // reps = 25 gives error prob < 2^(-50)

                    if (Continue && pr == 0) {
                        cout << "Cofactor is composite, continuing...\n";
                        factored = false;
                        mpz_set(n, cofactor);
                        cout << "Number to factor: \n" << mpz_class(n).get_str(10) << " (" << mpz_class(n).get_str(10).length() << " digits)\n";
                        std::cout << flush;
                    }
                    else {
                        factored = true;
                    }
                    mpz_clear(cofactor);
                }
            }
        }
        if (!factored) c++;
        delete P;
		delete SP;
    }
    
    if (!factored) {
        cout << "\nCould not find a non-trivial factor.\n";
    }
	std::cout << flush;
    
    // release resources
    mpz_clear(d); mpz_clear(a); mpz_clear(X0); mpz_clear(Y0); mpz_clear(Z0), mpz_clear(p); mpz_clear(S);
}


//
//    n         number to be factored
//    d1        product of very small primes - should be about sqrt(B2)
//	  numj		number of roots of F (degree of F)
//    k         number of stage 2 blocks
//    e         exponent for Brent-Suyama extension (degree of Dickson polynomial)
//    out       output factor
//    c         curve number
//    Hessian   true to use Hessian Z6xZ6 family, false to use Edwards Z4xZ8 family
//    d         twisted Hessian/Edwards curve d parameter
//    a         twisted Hessian/Edwards curve a parameter
//    SP        stage 1 multiple of point P, i.e. [S]P
//    verbose   switch for verbose output
//
bool stage2(mpz_t n, long d1, long numj, int k, int e,
        mpz_t out, int c, bool Hessian, mpz_t d, mpz_t a, epoint* SP, bool verbose)
{
    // Stage 2: Overview
    //
    //   1.  Compute roots of F
    //          These are computed as Psi[t] below
    //   2.  Construct F from its roots
    //          The product tree using Kronecker-Schonhage fast polynomial 'integer'
    //          multiplication is used
    //   3.  Compute 1/F
    //          The 'pseudo-inverse' of F, i.e. the precision-k reciprocal of F.
    //          To calculate this, we use the method in Bernstein's Fast Multiplication
    //          and its Applications
    //   4.  Let H = 1
    //   5.  Compute roots of Gj
    //          The entire set of 'giant step' roots Y([i*d1]SP), for i = 1...B2/d1
    //          is divided into k blocks, giving the roots of polynomials G1...Gk
    //          Each root is computed as Ya below.
    //          Note that d1 has been calculated as d1 = sqrt(b2), where B2 = k*b2
    //          The sets S1...Sk of roots cover all multiples of d1 up to B2
    //   6.  Construct Gj from its roots
    //          Same method as step 2
    //   7.  Let H = H*Gj (mod F)
    //          Multiplication is done via Kronecker-Schonhage.
    //          Reduction mod F is done using method of Bernstein, again see
    //          Fast Multiplication and its Applications
    //   8.  Repeat steps 5-7 for each j up to k
    //   9.  Evaluate H at the roots of F
    //          Bernstein's scaled remainder tree algorithm is used, see Bernstein
    //          Scaled Remainder Trees
    //          Compute product by multiplying together all residues mod n
    //   10. Compute gcd(product, n)
    //
    
    bool factored = false;
	clock_t total_start_time;
	clock_t total_end_time;
    clock_t start_time;
    clock_t end_time;
    int ms1 = 0, ms2 = 0, stage2ms = 0;
	if (verbose) {
        cout << "\nDegree of F is " << numj;
		std::cout << flush;
    }
	total_start_time = clock();
    start_time = total_start_time;

    //   1.  Compute roots of F	
	mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;
    epoint *j1, *j7, *j11, *j13, *j30, *Q; 
    j1 = new epoint(); j7 = new epoint(); j11 = new epoint(); j13 = new epoint(); j30 = new epoint();
    Q = new epoint();
    mpz_t s7, s11, s13, s30;
    mpz_init_set_ui(s7, 7); mpz_init_set_ui(s11, 11); mpz_init_set_ui(s13, 13); mpz_init_set_ui(s30, 30);

	// initialize stage 2 product mod n
    mpz_t product;
    mpz_init_set_ui(product, 1);

    // precompute baby steps
    mpz_t ZjSinv, YjSxZjSinv, mulmod;
    mpz_init(ZjSinv); mpz_init(YjSxZjSinv); mpz_init(mulmod);
    mpz_t* Psi = new mpz_t[numj];	
    for (long i = 0; i < numj; i++) mpz_init2(Psi[i], bits_in_n);
    /*
     * FFT Continuation
     * =====================
     * We want to compute points [p]SP, where p is every prime
     * in the interval [B1+1,B2].  We want to avoid having to
     * compute each prime multiple individually, so we use the
     * FFT continuation.  Say that all primes p = i*d1+j.
     * Since we must get primes, gcd(i*d1,j) = 1 necessarily.
     * Also, the method uses that Edwards y_coord([i*d1]SP) = y_coord(±[j]SP)
     * where affine y_coord(P) = y_coord(-P). For Hessian curves,
     * projective X_coord(P) = X_coord(-P). So we have certain
     * redundant j values. We assume 30 divides d1 and we include
     * only the 'positive' residues mod 30 that are coprime to d1.
     * These are 1,7,11,13 (mod 30). We let j30 = [30]SP
     * and calculate j1 = SP, j7 = [7]SP, j11 = [11]SP, j13 = [13]SP
     * and instead of looping over all j, we just keep adding j30
     * repeatedly to each of j1, j7, j11, j13.  This will give us
     * all required j multiples needed and skips some j values
     * that we don't need to consider, or have no chance of
     * producing a prime value of i*d1+j (e.g. if gcd(30,j) > 1).
     * This saves time because we only consider a subset of all j's.
     * A heuristic is that we only consider 2 in every 15 j values.
     * 
     * Note that the values of [j]SP are the roots of F.  We only
     * keep a j value if gcd(d1,j) = 1, and there are phi(d1)/2 of
     * these, up to sign.
     */
    j1->set(SP->x, SP->y, SP->z);
    if (Hessian) {
        ScalarMultiplyHessian(d, a, j1, j7, n, s7);
        ScalarMultiplyHessian(d, a, j1, j11, n, s11);
        ScalarMultiplyHessian(d, a, j1, j13, n, s13);
        ScalarMultiplyHessian(d, a, j1, j30, n, s30);        
    }
    else {
        ScalarMultiplyEdwards(d, a, j1, j7, n, s7);
        ScalarMultiplyEdwards(d, a, j1, j11, n, s11);
        ScalarMultiplyEdwards(d, a, j1, j13, n, s13);
        ScalarMultiplyEdwards(d, a, j1, j30, n, s30);
    }
    long t = 0;      // Psi[t] holds the y coordinates that we keep
    for (long j = 0; j < d1; j+=30) {
        if (j + 1 < d1 && gcd(d1, j+1) == 1L) {
            if (Hessian) {
				mpz_mul(mulmod, j1->y, j1->z);
				mpz_invert(ZjSinv, mulmod, n);
                mpz_mul(mulmod, j1->x, j1->x);
				mpz_mod(mulmod, mulmod, n);
				mpz_mul(mulmod, mulmod, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            } 
            else {
                mpz_invert(ZjSinv, j1->z, n);
                mpz_mul(mulmod, j1->y, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            }
            t++;
        }
        if (j + 7 < d1 && gcd(d1, j+7) == 1L) {
            if (Hessian) {
				mpz_mul(mulmod, j7->y, j7->z);
				mpz_invert(ZjSinv, mulmod, n);
                mpz_mul(mulmod, j7->x, j7->x);
				mpz_mod(mulmod, mulmod, n);
				mpz_mul(mulmod, mulmod, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            } 
            else {
                mpz_invert(ZjSinv, j7->z, n);
                mpz_mul(mulmod, j7->y, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            }
            t++;
        }
        if (j + 11 < d1 && gcd(d1, j+11) == 1L) {
            if (Hessian) {
				mpz_mul(mulmod, j11->y, j11->z);
				mpz_invert(ZjSinv, mulmod, n);
                mpz_mul(mulmod, j11->x, j11->x);
				mpz_mod(mulmod, mulmod, n);
				mpz_mul(mulmod, mulmod, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            } 
            else {
                mpz_invert(ZjSinv, j11->z, n);
                mpz_mul(mulmod, j11->y, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            }
            t++;
        }
        if (j + 13 < d1 && gcd(d1, j+13) == 1L) {
            if (Hessian) {
				mpz_mul(mulmod, j13->y, j13->z);
				mpz_invert(ZjSinv, mulmod, n);
                mpz_mul(mulmod, j13->x, j13->x);
				mpz_mod(mulmod, mulmod, n);
				mpz_mul(mulmod, mulmod, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            } 
            else {
                mpz_invert(ZjSinv, j13->z, n);
                mpz_mul(mulmod, j13->y, ZjSinv);
				mpz_mod(mulmod, mulmod, n);
                mpz_set(Psi[t], mulmod);
            }
            t++;
        }
        if (Hessian) {
            HEnAdd(d, a, j1, j30, Q, n); j1->set(Q->x, Q->y, Q->z);
            HEnAdd(d, a, j7, j30, Q, n); j7->set(Q->x, Q->y, Q->z);
            HEnAdd(d, a, j11, j30, Q, n); j11->set(Q->x, Q->y, Q->z);
            HEnAdd(d, a, j13, j30, Q, n); j13->set(Q->x, Q->y, Q->z);        
        }
        else {
            EEnAdd(d, a, j1, j30, Q, n); j1->set(Q->x, Q->y, Q->z);
            EEnAdd(d, a, j7, j30, Q, n); j7->set(Q->x, Q->y, Q->z);
            EEnAdd(d, a, j11, j30, Q, n); j11->set(Q->x, Q->y, Q->z);
            EEnAdd(d, a, j13, j30, Q, n); j13->set(Q->x, Q->y, Q->z);
        }
    }
    long tmax = t;
    end_time = clock();
    stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
    if (verbose) {
        cout << "\nComputing roots of F took " << stage2ms << "ms";
		std::cout << flush;
    }
    
    //   2.  Construct F from its roots
    //   Note:  The roots of F are Psi[t] for t = 0...tmax-1
    start_time = clock();
    long degF = tmax;
    long numnodes = NumProductTreeNodes(degF);
    long numpairs = (numnodes-1)/2;
    long* pairs = new long[numpairs*2];        // one pair takes up two consecutive array positions
    long* polydegrees = new long[numnodes];
    long totalcoeffs = GetPolyProductInfo(degF, numnodes, numpairs, pairs, polydegrees);
    // allocate memory for entire product tree
    mpz_t* Tree = new mpz_t[totalcoeffs];
	for (long i = 0; i < totalcoeffs; i++) mpz_init2(Tree[i], bits_in_n);
    // This version (PolyIntProductTree2) also returns the entire tree
    PolyIntProductTree2(Tree, polydegrees, Psi, degF, n, numnodes, totalcoeffs);
    // we keep a reversed copy of F = Tree[0]
    mpz_t* revF = new mpz_t[degF+1];
    for (long i = 0; i < degF+1; i++) mpz_init_set(revF[i], Tree[degF-i]);
    end_time = clock();
    stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
    if (verbose) {
        cout << "\nConstructing F from its roots took " << stage2ms << "ms";
		std::cout << flush;
    }

    //   3.  Compute 1/F
    start_time = clock();
    mpz_t* invF = new mpz_t[degF+1];
	for (long i = 0; i < degF+1; i++) mpz_init2(invF[i], bits_in_n);
    PolyReciprocal(Tree, degF+1, n, invF);
    mpz_t* revinvF = new mpz_t[degF];
    for (long i = 0; i < degF; i++) mpz_init_set(revinvF[i], invF[degF-1-i]);
    // compute reverse of 1/F

    end_time = clock();
    stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC; 
    if (verbose) {
        cout << "\nComputing precision-n reciprocal of F took " << stage2ms << "ms";
		std::cout << flush;
    }
    
    //   4.  Let H = 1
    long degG = degF - 1;
    mpz_t* H = new mpz_t[degG+1];	// H = H (mod F)
    for (uint64_t i = 0; i < degG+1; i++) mpz_init2(H[i], bits_in_n);
    mpz_set_ui(H[0], 1);
    
    epoint* d1SP = new epoint();
    epoint* id1SP = new epoint();
    epoint* id1SPnew = new epoint();
    
    // calculate [d1*S](X:Y:Z)
    mpz_t sd1; mpz_init_set_ui(sd1, d1);
    if (Hessian) ScalarMultiplyHessian(d, a, SP, d1SP, n, sd1);
    else ScalarMultiplyEdwards(d, a, SP, d1SP, n, sd1);
    id1SP->set(d1SP->x, d1SP->y, d1SP->z);	// i = 1

    mpz_t Ya, Zid1Sinv, YamPsit;
    mpz_init(Ya); mpz_init(Zid1Sinv); mpz_init(YamPsit);
        
    // Initialize array to hold roots of Gj
    mpz_t* Gjroots = new mpz_t[degG];
    for (long i = 0; i < degG; i++) mpz_init2(Gjroots[i], bits_in_n);    
    // Initialize array to hold Gj
    mpz_t* Gj = new mpz_t[degG + 1];
    for (long i = 0; i < degG+1; i++) mpz_init2(Gj[i], bits_in_n);
    // Initialize array to hold H*Gj
    mpz_t* HGj = new mpz_t[2*degG + 1];
    for (long i = 0; i < 2*degG+1; i++) mpz_init2(HGj[i], bits_in_n);
    // Initialize array to hold H*Gj/F
    mpz_t* HGjdivF = new mpz_t[2*degG+degF+1];
    for (long i = 0; i < 2*degG+degF+1; i++) mpz_init2(HGjdivF[i], bits_in_n);
    // Initialize array to hold q
    mpz_t* q = new mpz_t[degG+1];
    for (long i = 0; i < degG+1; i++) mpz_init2(q[i], bits_in_n);
    // Initialize array to hold q*F
    mpz_t* qF = new mpz_t[degG + degF + 1];
    for (long i = 0; i < degG+degF+1; i++) mpz_init2(qF[i], bits_in_n);
    	
    long numnodes2 = NumProductTreeNodes(degG);
    long numpairs2 = (numnodes-1)/2;
    long* pairs2 = new long[numpairs*2];        // one pair takes up two consecutive array positions
    long* polydegrees2 = new long[numnodes];

    // execute k blocks covering primes i*d1+j for i = 0...(k-1)*degG - 1, j = 1...degF
	mpz_t coeff; mpz_init2(coeff, bits_in_n);
    long istart = 1;
    for (long j = 0; j < k; j++) {
        start_time = clock();

        //   5.  Compute roots of Gj
        for (long i = istart; i < istart + degG; i++) { // note i,j above were for illustration 
            if (Hessian) {
				mpz_mul(mulmod, id1SP->y, id1SP->z);
				mpz_invert(Zid1Sinv, mulmod, n);
                mpz_mul(mulmod, id1SP->x, id1SP->x);
				mpz_mod(mulmod, mulmod, n);
				mpz_mul(mulmod, mulmod, Zid1Sinv);
				mpz_mod(Ya, mulmod, n);
                mpz_set(Gjroots[i-istart], Ya);
            } 
            else {
                mpz_invert(Zid1Sinv, id1SP->z, n);
                mpz_mul(mulmod, id1SP->y, Zid1Sinv);
                mpz_mod(Ya, mulmod, n);
                mpz_set(Gjroots[i-istart], Ya);
            }
			// giant step
            if (Hessian) HEnAdd(d, a, d1SP, id1SP, id1SPnew, n);
            else EEnAdd(d, a, d1SP, id1SP, id1SPnew, n);
            id1SP->set(id1SPnew->x, id1SPnew->y, id1SPnew->z);
        }
        end_time = clock();
        stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
        if (verbose) {
            cout << "\nComputing roots of G" << j + 1 << " took " << stage2ms << "ms";
			std::cout << flush;
        }

        //   6.  Construct Gj from its roots
        //   Note:  The roots of Gj are Gjroots[i] for i = 0...degG-1
        start_time = clock();
        GetPolyProductInfo(degG, numnodes2, numpairs2, pairs2, polydegrees2);
        // This version (PolyIntProductTree) just computes the product
        PolyIntProductTree(Gj, Gjroots, polydegrees2, degG, n, numnodes2);
        end_time = clock();
        stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
        if (verbose) {
            cout << "\nConstructing G" << j + 1 << " from its roots took " << stage2ms << "ms";
			std::cout << flush;
        }
		
        // this block only executes if j > 0 (hence k > 1) , and otherwise, G1 is already reduced mod F
        if (j > 0) {
            //   7.  Let H = H*Gj (mod F)
            start_time = clock();
            PolyIntProduct(H, Gj, degG, degG, n, HGj);		
            end_time = clock();
            stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC; 
            if (verbose) {
                cout << "\nComputing product H * G" << j + 1 << " took " << stage2ms << "ms";
				std::cout << flush;
            }

            start_time = clock();
            // now reduce HGj (mod F) using method shown in Fast Multiplication and its Applications (Bernstein)
            // first get degree of HGj, which might be less than 2*degG
            long degHGj = getPolyDegree(HGj, degG*2);
            // now reverse HGj
            for (long i = 0; i < (degHGj+1)/2; i++) {
                mpz_set(coeff, HGj[i]);
                mpz_set(HGj[i], HGj[degHGj-i]);
                mpz_set(HGj[degHGj-i], coeff);
            }
            PolyIntProduct(HGj, invF, degHGj, degF, n, HGjdivF);
            // Note that deg HGj < e.
            // reduce mod x^(e-d), i.e. discard all powers of x not less than e-d
            // where e-1 is the degree of HGj and d that of 1/F (precision degF+1)
            // here e = degHGj+1, d = degF, so e-d = degHGj+1-degF
            // we also reverse at the same time to give polynomial q
            // deg(qF) = deg(q) + deg(F) = degHGj-degF + degF = degHGj
            for (long i = 0; i < degHGj+1-degF; i++) mpz_set(q[i], HGjdivF[degHGj-degF-i]);
            // "un"-reverse HGj
            for (long i = 0; i < (degHGj+1)/2; i++) {
                mpz_set(coeff, HGj[i]);
                mpz_set(HGj[i], HGj[degHGj-i]);
                mpz_set(HGj[degHGj-i], coeff);
            }            
            // now compute r = HGj - q*F.  This is H*Gj (mod F)
            PolyIntProduct(q, revF, degG, degF, n, qF);
            // subtract qF from HGj one coefficient at a time, for degF coefficients
            for (long i = 0; i < degF; i++) {
                mpz_sub(coeff, HGj[i], qF[i]);
                mpz_mod(coeff, coeff, n);
                mpz_set(H[i], coeff);
            }
            end_time = clock();
            stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
            if (verbose) {
                cout << "\nReducing H = H * G" << j + 1 << " (mod F) took " << stage2ms << "ms";
				std::cout << flush;
            }
        }
        else {
            for (long i = 0; i < degF; i++) mpz_set(H[i], Gj[i]);
        }
		
		istart += degG;
    }
	mpz_clear(coeff);

    delete[] pairs2;
    delete[] polydegrees2;
    
    //   9.  Evaluate H at the roots of F
    PolyEval(H, Tree, pairs, polydegrees, Psi, revinvF, degG, degF, n, product);
        
    mpz_gcd(out, n, product);
    end_time = clock();
    stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
    if (verbose) {
        cout << "\nEvaluating H at the roots of F took " << stage2ms << "ms\n";
		std::cout << flush;
    }
		
    // release Tree and other polynomial memory
    for (long i = 0; i < degG+degF+1; i++) mpz_clear(qF[i]);
    delete[] qF;
    for (long i = 0; i < degG+1; i++) mpz_clear(q[i]);
    delete[] q;
    for (long i = 0; i < 2*degG+degF+1; i++) mpz_clear(HGjdivF[i]);
    delete[] HGjdivF;
    for (long i = 0; i < 2*degG+1; i++) mpz_clear(HGj[i]);
    delete[] HGj;
    for (long i = 0; i < degG+1; i++) mpz_clear(Gj[i]);
    delete[] Gj;
    for (long i = 0; i < degG; i++) mpz_clear(Gjroots[i]);    
    delete[] Gjroots;
    mpz_clear(YamPsit); mpz_clear(Zid1Sinv); mpz_clear(Ya);
	mpz_clear(sd1);
	delete id1SPnew;
	delete id1SP;
	delete d1SP;
    for (long i = 0; i < degG+1; i++) mpz_clear(H[i]);
    delete[] H;
    for (long i = 0; i < degF; i++) mpz_clear(revinvF[i]);
    delete[] revinvF;
    for (long i = 0; i < degF+1; i++) mpz_clear(invF[i]);
    delete[] invF;
    for (long i = 0; i < degF+1; i++) mpz_clear(revF[i]);
    delete[] revF;
    for (long i = 0; i < totalcoeffs; i++) mpz_clear(Tree[i]);
    delete[] Tree;

    total_end_time = clock();
    stage2ms = (end_time - start_time)*1000.0/(double)CLOCKS_PER_SEC;
    int stage2totalms = (total_end_time - total_start_time)*1000.0/(double)CLOCKS_PER_SEC;
    
	cout << (verbose?"\n":"") << "Stage 2 took " << stage2totalms << "ms";
	std::cout << flush;
    if (mpz_cmp_ui(out, 1) > 0 && mpz_cmp(n, out) > 0) {
        cout << "\nFactored in stage 2 using curve #" << c + 1 << "\n";
        factored = true;
    }
    
    // release resources
	delete[] polydegrees;
	delete[] pairs;
    for (int i = 0; i < numj; i++) mpz_clear(Psi[i]);
    delete[] Psi;
	mpz_clear(mulmod);
	mpz_clear(YjSxZjSinv);
    mpz_clear(ZjSinv);
    mpz_clear(product);
    mpz_clear(s30); mpz_clear(s13); mpz_clear(s11); mpz_clear(s7);
    delete Q;
	delete j30;
	delete j13;
	delete j11;
	delete j7;
    delete j1;
    
    return factored;
}

/*
 * Returns the highest index i in [0, degHmax] for which H[i] is non-zero,
 * scanning downwards from degHmax.
 *
 * H          array of coefficients; must hold at least degHmax+1 entries
 * degHmax    highest index to inspect
 *
 * If every inspected coefficient is zero, 0 is returned (the zero polynomial
 * is reported as a constant) rather than scanning below the start of H.
 */
long getPolyDegree(mpz_t* H, long degHmax)
{
    long degH = degHmax;
    // Stop at index 0: without this bound an all-zero H would read H[-1].
    while (degH > 0 && mpz_cmp_ui(H[degH], 0) == 0) degH--;
    return degH;
}

/*
 * Returns the number of nodes in the product tree built over numroots leaves.
 *
 * The leaf count is split into its binary digits: each power of two 2^s
 * contributes a complete subtree of 2^(s+1)-1 nodes, plus one extra node that
 * joins it to whatever is left over.  For numroots > 0 this sums to
 * 2*numroots - 1; for numroots <= 0 the result is 0.
 *
 * floor(log2(r)) is found with integer shifts rather than log(r)/log(2), so
 * the result cannot be thrown off by floating-point rounding near exact
 * powers of two, and all shifts are carried out in long rather than int.
 */
long NumProductTreeNodes(long numroots)
{
    long nodes = 0;
    long remaining = numroots;
    while (remaining > 0) {
        // s = floor(log2(remaining))
        int s = 0;
        while ((remaining >> (s + 1)) > 0) s++;
        const long block = 1L << s;           // largest power of two <= remaining
        if (remaining > block) nodes++;       // node joining this block to the remainder
        nodes += block + (block - 1);         // complete subtree over 'block' leaves
        remaining -= block;
    }
    return nodes;
}

/*
 * Computes the shape of the product tree over numroots linear polynomials.
 *
 * numroots      number of leaves (degree-1 polynomials)
 * numnodes      total number of tree nodes; expected to match
 *               NumProductTreeNodes(numroots)
 * numpairs      number of sibling pairs; expected to be (numnodes-1)/2
 * pairs         output array, one entry per non-root node (2*numpairs entries).
 *               Starting at the root, each visited node that has children
 *               appends its right child and then its left child; the left
 *               child is visited next.  Each entry is stored as
 *               numpairs*2 - node, where nodes are numbered in creation
 *               order (leaves 0..numroots-1, then products level by level).
 * polydegrees   output array of numnodes entries; polydegrees[numnodes-1-k]
 *               is the degree of node k, so polydegrees[0] is the root.
 *
 * Returns the total number of coefficients over all nodes, i.e. the sum of
 * (degree + 1).  Returns 0 without touching the output arrays when
 * numroots <= 0.
 */
long GetPolyProductInfo(long numroots, long numnodes, long numpairs, long* pairs, long* polydegrees)
{
    // An empty tree has no root: the traversal below would otherwise start at
    // node numnodes-1 == -1 and read before the start of the child table.
    if (numroots <= 0) return 0;

    long totalcoeffs = numroots*2;              // every leaf holds two coefficients

    // children[2*k] / children[2*k+1] are the left / right child of node k, -1 for leaves
    std::vector<long> children(numnodes*2);
    for (long i = 0; i < numroots; i++) {
        children[2*i] = -1;
        children[2*i+1] = -1;
        polydegrees[numnodes-1-i] = 1;
    }

    // current tree level: node number and degree of each polynomial on it
    std::vector<long> levelnode(numroots);
    std::vector<long> leveldeg(numroots, 1);
    for (long i = 0; i < numroots; i++) levelnode[i] = i;

    // pair up neighbours level by level until a single polynomial remains
    long last = numroots - 1;                   // index of the last entry on this level
    long nextnode = numroots;                   // number given to the next product node
    while (last > 0) {
        for (long i = 0; i <= last; i += 2) {
            if (i < last) {
                // the degree of a product is the sum of the factor degrees
                const long degree = leveldeg[i] + leveldeg[i+1];
                polydegrees[numnodes-1-nextnode] = degree;
                totalcoeffs += degree + 1;
                children[nextnode*2] = levelnode[i];
                children[nextnode*2+1] = levelnode[i+1];
                // compact in place; slot i/2 is never ahead of the slots still to be read
                levelnode[i/2] = nextnode;
                leveldeg[i/2] = degree;
                nextnode++;
            }
            else {
                // unpaired polynomial at the end moves up a level unchanged
                levelnode[i/2] = levelnode[i];
                leveldeg[i/2] = leveldeg[i];
            }
        }
        // entries above last/2 are never read again, so they need no reset
        last = last/2;
    }

    // depth-first walk from the root, left subtree first, emitting sibling pairs
    std::stack<long> pending;
    pending.push(numnodes-1);
    long outpos = 0;
    while (!pending.empty()) {
        const long node = pending.top();
        pending.pop();
        const long left = children[node*2];
        const long right = children[node*2+1];
        if (right != -1) { pending.push(right); pairs[outpos] = numpairs*2-right; outpos++; }
        if (left != -1) { pending.push(left); pairs[outpos] = numpairs*2-left; outpos++; }
    }

    return totalcoeffs;
}

/*
 * Returns an offset into a flat coefficient array in which polynomial i
 * occupies polydegrees[i]+1 consecutive slots: the slots used by polynomials
 * 0..polynum-1 are skipped, then polydegrees[polynum] is added, giving the
 * last slot of polynomial polynum.
 *
 * polydegrees   degree of each polynomial, by index
 * polynum       index of the polynomial to locate
 */
long getTreePolyStart(long* polydegrees, long polynum)
{
    // begin with the requested polynomial's own degree ...
    long offset = polydegrees[polynum];
    // ... then add the slot count of every polynomial stored before it
    for (long k = polynum - 1; k >= 0; --k) {
        offset += polydegrees[k] + 1;
    }
    return offset;
}

/*
 * Walks the product tree of F from the root downwards and accumulates in
 * 'product' (mod n) one coefficient from every step where the walk bottoms
 * out (d == 0 below).  The caller visible in this file takes gcd(product, n)
 * afterwards.
 * NOTE(review): this looks like a scaled remainder tree evaluation of H at the
 * roots of F (see the Bernstein reference cited in stage2) - confirm.
 *
 * H            polynomial that is multiplied by invF to seed the walk
 * Tree         flat coefficient storage of the product tree; sub-polynomials
 *              are addressed through getTreePolyStart()
 * pairs        sibling indices into polydegrees, consumed two per iteration:
 *              pairs[t+1] selects P and pairs[t] selects Q
 * polydegrees  degree of every tree polynomial, index 0 being the root
 * Froots       not referenced by this function
 * invF         polynomial multiplied with H (the caller passes its reversed 1/F)
 * degH         degree of H as handed to PolyIntProduct
 * degF         degree of F; the seed node holds degF coefficients
 * n            modulus
 * product      output, set to 1 and then multiplied up mod n
 *
 * NOTE(review): polydegrees[1], polydegrees[2] and pairs[0..1] are read
 * unconditionally, so the tree is assumed to have at least two leaves.
 */
void PolyEval(mpz_t* H, mpz_t* Tree, long* pairs, long* polydegrees, 
        mpz_t* Froots, mpz_t* invF, long degH, long degF, mpz_t n, mpz_t product)
{
    // work stack of polynomials still to be split, with a parallel stack of their degrees
    std::stack<mpz_t*> nodePQs;
    std::stack<long> degnodePQs;
    long degnodePQ;
    long degP, degQ;
    long d = 0;
    mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;
    
    // set product to 1
    mpz_set_ui(product, 1);

    // push H*1/F
    // only the top degF coefficients HdivF[degH .. degH+degF-1] are kept as the seed node
    mpz_t* HdivF = new mpz_t[degH+degF];
    for (long i = 0; i < degH+degF; i++) mpz_init2(HdivF[i], bits_in_n);
    PolyIntProduct(H, invF, degH, degF-1, n, HdivF);
    mpz_t* nodePQ = new mpz_t[degF];
    for (long i = 0; i < degF; i++) mpz_init_set(nodePQ[i], HdivF[i+degH]);
    nodePQs.push(nodePQ);
	degnodePQ = degF-1;
    degnodePQs.push(degnodePQ);
	
    // initialize nodeP
    // scratch buffer sized once for the largest product formed in the loop:
    // the seed node times the larger of the root's two children
	degP = (polydegrees[1]>polydegrees[2]?polydegrees[1]:polydegrees[2]);	// maximum value
    mpz_t* nodeP = new mpz_t[degnodePQ+degP+1];
    for (long i = 0; i < degnodePQ+degP+1; i++) mpz_init2(nodeP[i], bits_in_n);

    // initialize nodeQ
    // second scratch buffer of the same size
	degQ = degP;
    mpz_t* nodeQ = new mpz_t[degnodePQ+degQ+1];
    for (long i = 0; i < degnodePQ+degQ+1; i++) mpz_init2(nodeQ[i], bits_in_n);

    long t = 0;
    long treePolyStart = polydegrees[0] + 1;	// skip root polynomial in tree
    while (!nodePQs.empty()) {
        // take the most recently pushed node; pairs[t], pairs[t+1] name the two
        // tree polynomials it is to be multiplied by
        nodePQ = nodePQs.top();
        nodePQs.pop();
        degnodePQ = degnodePQs.top();
        degnodePQs.pop();
        degP = polydegrees[pairs[t+1]];
        degQ = polydegrees[pairs[t]];	// Q is remembered, we traverse to P

        // set polynomial position in tree at P
        treePolyStart = getTreePolyStart(polydegrees, pairs[t+1]);

        // compute nodeP = nodePQ*P (mod x^degPQ-1)
        TreePolyIntProduct(nodePQ, Tree, treePolyStart, degnodePQ, degP, n, nodeP);
        // reduce mod x^(degnodePQ+1)-1
        // i.e. fold coefficients degnodePQ+1 .. degnodePQ+degP back onto 0 .. degP-1
        for (long i = 0; i < degP; i++) {
            mpz_add(nodeP[i], nodeP[i+degnodePQ+1], nodeP[i]);
            mpz_mod(nodeP[i], nodeP[i], n);
        }
        d = degnodePQ - degP;
        if (d == 0) {
            // nothing left to split on this side: fold the single remaining
            // coefficient into the running product
            //cout << "\n" << mpz_get_str(NULL, 10, nodeP[degnodePQ]);// << "\n";
            mpz_mul(product, product, nodeP[degP]);
            mpz_mod(product, product, n);
        }
        else {
            // keep the top d+1 coefficients as a new node of degree d
            mpz_t* nextnode1 = new mpz_t[d+1];
            for (long i = 0; i < d+1; i++) mpz_init_set(nextnode1[i], nodeP[degnodePQ-d+i]);
            nodePQs.push(nextnode1);
            degnodePQs.push(d);
        }

        // set polynomial position in tree at Q
        treePolyStart = getTreePolyStart(polydegrees, pairs[t]);

        // compute nodeQ = nodePQ*Q (mod x^(degnodePQ+1)-1)
        TreePolyIntProduct(nodePQ, Tree, treePolyStart, degnodePQ, degQ, n, nodeQ);
        
		// reduce mod x^(degnodePQ+1)-1
        for (long i = 0; i < degQ; i++) {
            mpz_add(nodeQ[i], nodeQ[i+degnodePQ+1], nodeQ[i]);
            mpz_mod(nodeQ[i], nodeQ[i], n);
        }
        d = degnodePQ - degQ;
        if (d == 0) {
            //cout << "\n" << mpz_get_str(NULL, 10, nodeQ[degnodePQ]);// << "\n";
            mpz_mul(product, product, nodeQ[degQ]);
            mpz_mod(product, product, n);
        }
        else {
            // pushed after the P-side node, so this one is popped first on the next iteration
            mpz_t* nextnode2 = new mpz_t[d+1];
            for (long i = 0; i < d+1; i++) mpz_init_set(nextnode2[i], nodeQ[degnodePQ-d+i]);
            nodePQs.push(nextnode2);
            degnodePQs.push(d);
        }

        // clear memory
        // the node just processed held degnodePQ+1 coefficients
        for (long i = 0; i < degnodePQ+1; i++) mpz_clear(nodePQ[i]);
        delete[] nodePQ;

        // advance to next pair of polynomials
        t += 2;
    }

	// clear memory
	// Recompute the scratch-buffer length used at allocation time.
	// NOTE(review): allocation used max(polydegrees[1], polydegrees[2]) while this
	// uses polydegrees[pairs[1]]; the two agree only if pairs[1] names the larger
	// child of the root - confirm.
	degnodePQ = degF-1;
	degP = polydegrees[pairs[1]];
	degQ = degP;	//polydegrees[pairs[0]];  // fix for inconspicuous memory leak
	for (long i = 0; i < degnodePQ+degQ+1; i++) mpz_clear(nodeQ[i]);
	delete[] nodeQ;
	for (long i = 0; i < degnodePQ+degP+1; i++) mpz_clear(nodeP[i]);
	delete[] nodeP;	
    for (long i = 0; i < degH+degF; i++) mpz_clear(HdivF[i]);
	delete[] HdivF;
}

/*
 * Debug helper: writes the base-10 representation of n to standard output
 * on a line of its own (a newline before and after the digits).
 */
void getstr(mpz_t n)
{
    const mpz_class value(n);
    const std::string digits = value.get_str(10);
    std::cout << "\n" << digits << "\n";
}

/*
 * Builds the complete product tree of prod (x - roots[i]) mod n and leaves
 * every intermediate polynomial in Tree.
 *
 * Tree         pointer to output array representing polynomials in tree
 *              (must already hold totalcoeffs initialised mpz_t values that are 0;
 *              the leaf constants are formed by subtracting the root in place)
 * polydegrees	pointer to input array of degrees of every product polynomial
 * roots        pointer to input array of roots of polynomial F
 * numroots     number of roots of F (degree of F)
 * n            we work in the ring of integers modulo n
 * numnodes		total number of nodes in the product tree
 * totalcoeffs	total number of coefficients of all polynomials in entire product tree
 *
 * Layout: the leaves are written from the end of Tree backwards and each
 * product is written immediately below the previously written polynomial.
 * Within one polynomial the constant term sits at the highest index and the
 * coefficient of x^j at (that index - j).
 *
 * Each product is formed by packing both factors into one big integer with a
 * fixed number of bytes per coefficient, multiplying once with mpz_mul, and
 * cutting the result back into coefficients.
 */
void PolyIntProductTree2(mpz_t* Tree, long* polydegrees, mpz_t* roots, long numroots, mpz_t n, long numnodes, long totalcoeffs)
{
    // NOTE(review): 'one' and 'bits_in_n' are initialised but never read in this function
    mpz_t one; mpz_init_set_ui(one, 1);
    mp_bitcnt_t l;
	mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;

    // Create product arrays, where each element of arr1 holds 
    // the position in the tree array of a product tree node polynomial
    // and arr2 the corresponding polynomial degree
    long* arr1 = new long[numroots]();
    long* arr2 = new long[numroots]();
    long treepos = totalcoeffs - 1;
    for (long i = 0; i < numroots; i++) {
        arr1[i] = treepos;
        arr2[i] = polydegrees[numnodes - 1 - i];
        treepos -= (polydegrees[numnodes - 1 - i] + 1);
    }

    // allocate space for entire tree at once
    /*for (long i = 0; i < totalcoeffs; i++) 
        mpz_init2(Tree[i], bits_in_n);*/		// this led to subtle memory fragmentation here
    
    // Set polynomials at youngest nodes of tree
    // (Tree[treepos] is assumed to be 0 here, so the subtraction yields -roots[i])
    treepos = totalcoeffs - 1;
    for (long i = 0; i < numroots; i++) {
        mpz_sub(Tree[treepos], Tree[treepos], roots[i]);	// constant term -roots[i]
        mpz_mod(Tree[treepos], Tree[treepos], n);			// reduce mod n
        mpz_set_ui(Tree[treepos-1], 1);      // monic polynomial of degree 1
        treepos -= 2;
    }

    // Size the shared byte buffer from the root degree, the largest in the tree
    long d = polydegrees[0];
    d = d+1;	// polynomial has degree strictly less than d now
    mpz_t dnn; mpz_init_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d); // compute d*n^2
    mpz_t R; mpz_init(R);
    l = getpow(dnn, R);		// get l such that 2^(l-1) < d*n^2 < 2^l = R
    while ((l%8)!=0) l++;	// increase l to byte boundary
    long ldiv8 = l / 8;		// number of bytes
    long prepolynumbytes = (numroots+1)*ldiv8;
    char* prepoly = new char[prepolynumbytes];		// *** will this take up a lot of memory? ***

	// compute number of bytes to store Tree[1]
	d = polydegrees[1];
    d = d+1;	// polynomial has degree strictly less than d now
    mpz_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d); // compute d*n^2
    l = getpow(dnn, R);		// get l such that 2^(l-1) < d*n^2 < 2^l = R
    while ((l%8)!=0) l++;	// increase l to byte boundary
    ldiv8 = l / 8;		// number of bytes
    long Tree1numbytes = d*ldiv8;

    // we use two structures, the tree (which we form node by node) and a product array,
    // which represents a level in the tree. The problem to overcome is level order traversal
    // of an in-order tree
    
    // Compute product tree
    // intP/intQ are pre-sized for a packed factor, intPQ for a packed product
    mpz_t coeff;
    mpz_init(coeff);
    long arraypos = numroots - 1;
	mpz_t intP; mpz_init2(intP, Tree1numbytes*8 + 64);
	mpz_t intQ; mpz_init2(intQ, Tree1numbytes*8 + 64);
	mpz_t intPQ; mpz_init2(intPQ, prepolynumbytes*8 + 64);
    while (arraypos > 0) {        
        // One pass per tree level; arraypos is the index of the last polynomial on it.
        // d becomes the larger degree of the last two entries of the level and fixes
        // the bytes per coefficient (ldiv8) used for packing on this level.
        d = arr2[arraypos];
        if (arraypos > 0)
            if (arr2[arraypos-1] > d)
                d = arr2[arraypos-1];
        d = d+1;	// both polynomials have degree strictly less than d now
        mpz_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d); // compute d*n^2
        l = getpow(dnn, R);		// get l such that 2^(l-1) < d*n^2 < 2^l = R
        while ((l%8)!=0) l++;	// increase l to byte boundary
        ldiv8 = l / 8;		// number of bytes
        prepolynumbytes = (numroots+1)*ldiv8;	// NOTE(review): value is not read after this point

        for (long i = 0; i <= arraypos; i += 2) {
            if(i < arraypos) {    
                // multiply integers using fast integer (FFT) arithmetic
                // Note that gnu-mp does this automatically when required

                // first polynomial is at Tree[arr1[i]] of degree polydegrees[arr2[i]]
                // second polynomial is at Tree[arr1[i+1]] of degree polydegrees[arr2[i+1]]

                // convert Tree[arr1[i]] to an integer intP
                // (buffer is zeroed first: mpz_export writes only the significant bytes;
                //  order=-1 with size=1 gives a least-significant-byte-first string)
                long numcoeffs = arr2[i] + 1;
                for (long j = 0; j < numcoeffs*ldiv8; j++) prepoly[j] = 0;
                for (long j = 0; j < numcoeffs; j++)
                    mpz_export(prepoly + j*ldiv8, NULL, -1, 1, 0, 0, Tree[arr1[i] - j]);
                // now that we have the bytes making up intP in the array prepoly,
                // construct intP
                mpz_import(intP, numcoeffs*ldiv8, -1, 1, 0, 0, prepoly);

                // convert Tree[arr1[i+1]] to an integer intQ
                numcoeffs = arr2[i+1] + 1;
                for (long j = 0; j < numcoeffs*ldiv8; j++) prepoly[j] = 0;
                for (long j = 0; j < numcoeffs; j++) 
                    mpz_export(prepoly + j*ldiv8, NULL, -1, 1, 0, 0, Tree[arr1[i+1] - j]);
                // now that we have the bytes making up intQ in the array prepoly,
                // construct intQ
                mpz_import(intQ, numcoeffs*ldiv8, -1, 1, 0, 0, prepoly);	

                // now we do the actual integer multiplication
                mpz_mul(intPQ, intP, intQ);

                // extract bytes of integer product into array prepoly
                numcoeffs = arr2[i] + arr2[i+1] + 1;
                for (long j = 0; j < numcoeffs*ldiv8; j++) prepoly[j] = 0;
                mpz_export(prepoly, NULL, -1, 1, 0, 0, intPQ);

                // now extract coefficients and reduce mod n, set Tree[treepos]
                for (long j = 0; j < numcoeffs; j++) {
                    // Extract jth coefficient and reduce it mod n
                    mpz_import(coeff, ldiv8, -1, 1, 0, 0, prepoly + j*ldiv8);
                    mpz_mod(coeff, coeff, n);
                    mpz_set(Tree[treepos-j], coeff);
                }

                // then we set arr1[i/2] = treepos and arr2[i/2] to the degree of the product above
                arr1[i/2] = treepos;
                arr2[i/2] = numcoeffs - 1;

                // advance to next polynomial position in tree
                treepos = treepos - numcoeffs;
            }
            else {
                // odd one out at the end of the level: carried up unchanged
                arr1[i/2] = arr1[i];
                arr2[i/2] = arr2[i];
            }
        }
        
		mpz_set_ui(intPQ, 0); mpz_set_ui(intQ, 0); mpz_set_ui(intP, 0);

        // the next level has half as many polynomials (rounded up)
        arraypos = arraypos/2;
    }
    // release allocated memory
	mpz_clear(intPQ);
	mpz_clear(intQ);
	mpz_clear(intP);

	mpz_clear(coeff);
    delete[] prepoly;
    mpz_clear(R);
    mpz_clear(dnn);
    delete[] arr2;
    delete[] arr1;
	mpz_clear(one);
}

/*
 * Computes G(x) = prod (x - roots[i]) mod n without keeping the intermediate
 * tree: products are formed level by level in a private work array and only
 * the final polynomial is copied out.
 *
 * G            pointer to output array of numroots+1 initialised mpz_t integers;
 *              G[j] receives the coefficient of x^j
 * roots        pointer to input array of roots of the polynomial
 * polydegrees  pointer to input array of degrees of every product tree polynomial
 * numroots     number of roots (degree of G)
 * n            we work in the ring of integers modulo n
 * numnodes     total number of nodes in the product tree
 */
void PolyIntProductTree(mpz_t* G, mpz_t* roots, long* polydegrees, long numroots, mpz_t n, long numnodes)
{
    // NOTE(review): 'one' is initialised but never read in this function
    mpz_t one; mpz_init_set_ui(one, 1);
    mp_bitcnt_t l;
	mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;

    // Create product arrays, where each element of arr1 holds 
    // the position in the tree array of a product tree node polynomial
    // and arr2 the corresponding polynomial degree
    long* arr1 = new long[numroots]();
    long* arr2 = new long[numroots]();
	long treepos = 0;
    for (long i = 0; i < numroots; i++) {
        arr1[i] = treepos;
        arr2[i] = polydegrees[numnodes - 1 - i];
		treepos += 2;
    }

    // Set tree.  Note this is just one level of the tree
    // Two slots per root; every product is later written back over the slots
    // of its own factors, so no further storage is needed.
    mpz_t* Tree = new mpz_t[2*numroots];
    for (long i = 0; i < 2*numroots; i++) {
        mpz_init2(Tree[i], bits_in_n);
        mpz_set_ui(Tree[i], 1);
    }
    
    // Set polynomials at youngest nodes of tree
	// Note that we have already set every element to 1
    // (coefficients are stored lowest degree first: constant at 2*i, x-coefficient at 2*i+1)
    treepos = 0;
    for (long i = 0; i < numroots; i++) {
        mpz_set_ui(Tree[treepos], 0);
        mpz_sub(Tree[treepos], Tree[treepos], roots[i]);	// constant term -roots[i]
        mpz_mod(Tree[treepos], Tree[treepos], n);			// reduce mod n
        treepos += 2;
    }

    // Size the shared byte buffer from the root degree, the largest in the tree
    long d = polydegrees[0];
    d = d+1;	// both polynomials have degree strictly less than d now
    mpz_t dnn; mpz_init_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d); // compute d*n^2
    mpz_t R; mpz_init(R);
    l = getpow(dnn, R);		// get l such that 2^(l-1) < d*n^2 < 2^l = R
    while ((l%8)!=0) l++;	// increase l to byte boundary
    long ldiv8 = l / 8;		// number of bytes
    long prepolynumbytes = (numroots+1)*ldiv8;
    char* prepoly = new char[prepolynumbytes];		// *** will this take up a lot of memory? ***

	// compute number of bytes to store Tree[1]
	d = polydegrees[1];
    d = d+1;	// polynomial has degree strictly less than d now
    mpz_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d); // compute d*n^2
    l = getpow(dnn, R);		// get l such that 2^(l-1) < d*n^2 < 2^l = R
    while ((l%8)!=0) l++;	// increase l to byte boundary
    ldiv8 = l / 8;		// number of bytes
    long Tree1numbytes = d*ldiv8;

    // we use one structure, a product array,
    // which represents a level in the tree. The problem to overcome is level order traversal
    // of an in-order tree
    
    // Compute product tree
    // intP/intQ are pre-sized for a packed factor, intPQ for a packed product
    mpz_t coeff;
    mpz_init(coeff);
    long arraypos = numroots - 1;
	mpz_t intP; mpz_init2(intP, Tree1numbytes*8 + 64);
	mpz_t intQ; mpz_init2(intQ, Tree1numbytes*8 + 64);
	mpz_t intPQ; mpz_init2(intPQ, prepolynumbytes*8 + 64);
    while (arraypos > 0) {        
        // One pass per tree level; arraypos is the index of the last polynomial on it.
        // d becomes the larger degree of the last two entries of the level and fixes
        // the bytes per coefficient (ldiv8) used for packing on this level.
        d = arr2[arraypos];
        if (arraypos > 0)
        if (arr2[arraypos-1] > d)
            d = arr2[arraypos-1];
        d = d+1;	// both polynomials have degree strictly less than d now
        mpz_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d); // compute d*n^2
        l = getpow(dnn, R);		// get l such that 2^(l-1) < d*n^2 < 2^l = R
        while ((l%8)!=0) l++;	// increase l to byte boundary
        ldiv8 = l / 8;		// number of bytes
        // NOTE(review): this declaration shadows the outer prepolynumbytes and is never read
        long prepolynumbytes = (numroots+1)*ldiv8;
		                
        for (long i = 0; i <= arraypos; i += 2) {
            if(i < arraypos) {    
                // multiply integers using fast integer (FFT) arithmetic
                // Note that gnu-mp does this automatically when required

                // first polynomial is at Tree[arr1[i]] of degree polydegrees[arr2[i]]
                // second polynomial is at Tree[arr1[i+1]] of degree polydegrees[arr2[i+1]]

                // convert Tree[arr1[i]] to an integer intP
                // (buffer is zeroed first: mpz_export writes only the significant bytes;
                //  order=-1 with size=1 gives a least-significant-byte-first string)
                long numcoeffs = arr2[i] + 1;
                for (long j = 0; j < numcoeffs*ldiv8; j++) prepoly[j] = 0;
                for (long j = 0; j < numcoeffs; j++)
                mpz_export(prepoly + j*ldiv8, NULL, -1, 1, 0, 0, Tree[arr1[i] + j]);
                // now that we have the bytes making up intP in the array prepoly,
                // construct intP
                mpz_import(intP, numcoeffs*ldiv8, -1, 1, 0, 0, prepoly);

                // convert Tree[arr1[i+1]] to an integer intQ
                numcoeffs = arr2[i+1] + 1;
                for (long j = 0; j < numcoeffs*ldiv8; j++) prepoly[j] = 0;
                for (long j = 0; j < numcoeffs; j++) 
                mpz_export(prepoly + j*ldiv8, NULL, -1, 1, 0, 0, Tree[arr1[i+1] + j]);
                // now that we have the bytes making up intQ in the array prepoly,
                // construct intQ
                mpz_import(intQ, numcoeffs*ldiv8, -1, 1, 0, 0, prepoly);	

                // now we do the actual integer multiplication
                mpz_mul(intPQ, intP, intQ);

                // extract bytes of integer product into array prepoly
                numcoeffs = arr2[i] + arr2[i+1] + 1;
				for (long j = 0; j < numcoeffs*ldiv8; j++) prepoly[j] = 0;
                mpz_export(prepoly, NULL, -1, 1, 0, 0, intPQ);

                // now extract coefficients and reduce mod n, set Tree[treepos]
                // (the product overwrites its first factor, starting at Tree[arr1[i]])
                for (long j = 0; j < numcoeffs; j++) {
                    // Extract jth coefficient and reduce it mod n
                    mpz_import(coeff, ldiv8, -1, 1, 0, 0, prepoly + j*ldiv8);
                    mpz_mod(coeff, coeff, n);
                    mpz_set(Tree[arr1[i] + j], coeff);
                }

                // then we set arr1[i/2] = position of first polynomial of product pair
                // and arr2[i/2] to the degree of the product
                arr1[i/2] = arr1[i];
                arr2[i/2] = numcoeffs - 1;
            }
            else {
                // odd one out at the end of the level: carried up unchanged
                arr1[i/2] = arr1[i];
                arr2[i/2] = arr2[i];
            }
        }
		
		mpz_set_ui(intPQ, 0); mpz_set_ui(intQ, 0); mpz_set_ui(intP, 0);
        
        // the next level has half as many polynomials (rounded up)
        arraypos = arraypos/2;
    }

    // copy coefficients of last product to G
    // (the final product starts at Tree[0] because arr1[0] is never moved)
    for (long i = 0; i < numroots + 1; i++) mpz_set(G[i], Tree[i]);
    
    // release allocated memory
	mpz_clear(intPQ);
	mpz_clear(intQ);
	mpz_clear(intP); 

    mpz_clear(coeff);
    delete[] prepoly;
    for (long i = 0; i < 2*numroots; i++) mpz_clear(Tree[i]);
    delete[] Tree;
	mpz_clear(R);
    mpz_clear(dnn);
    delete[] arr2;
    delete[] arr1;
	mpz_clear(one);
}

/*
 * PolyReciprocal
 *
 * Computes the reciprocal of the polynomial f modulo x^d, with coefficients
 * reduced mod n, and writes its d coefficients to output[0..d-1].
 *
 * f        input polynomial; f[j] is read for j = 0..d-1.  It must be supplied
 *          as a reversed, monic polynomial, i.e. the constant term f[0] is 1.
 *          The degree of f is strictly less than d.
 * d        number of coefficients to compute (precision x^d); d == 0 is a no-op
 * n        modulus for the coefficients
 * output   receives the d coefficients of the reciprocal (already initialised by caller)
 *
 * Method: Newton iteration g(k) = g(k-1) - (f*g(k-1) - 1)*g(k-1)  (mod x^m),
 * where m = ceil(d / 2^(l-k)) grows towards d.  Every polynomial product is done
 * as ONE big-integer product (Kronecker substitution): each coefficient is
 * written into its own fixed-width slot of ldiv8 bytes, wide enough for d*n^2.
 * Coefficients of f are assumed to lie in [0, n) so that they fit their slot.
 *
 * Note - the approach to software engineering used is the CVCVC approach -
 *
 *  Construct frame
 *  Verify logic
 *  Correct syntax (get to compile)
 *  Verify execution
 *  Clean up memory management etc
 *
 *  (I made that up!)
 */
void PolyReciprocal(mpz_t* f, uint64_t d, mpz_t n, mpz_t* output)
{
    // Nothing to output; also avoids writing gk[0] into a zero-length array.
    if (d == 0) return;

    mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;

    // gk holds the current approximation; d coefficients is its final (maximum) size
    mpz_t* gk = new mpz_t[d];
    for (uint64_t i = 0; i < d; i++) mpz_init2(gk[i], bits_in_n);

    // slot width: number of bytes required to store d*n^2
    mpz_t dnn; mpz_init(dnn);
    mpz_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d);    // compute d*n^2
    mpz_t R; mpz_init(R);
    mp_bitcnt_t l = getpow(dnn, R);     // number of bits required to store d*n^2
    while ((l%8)!=0) l++;               // pad until we are at a byte boundary
    const size_t ldiv8 = l / 8;

    // byte images of the polynomials (zero-initialised):
    // gk and fm have at most d slots, their product at most 2*d slots
    const size_t operandBytes = d*ldiv8;
    const size_t productBytes = 2*d*ldiv8;
    char* prepolygk = new char[operandBytes]();
    char* prepolyfm = new char[operandBytes]();
    char* prepolyfmgk = new char[productBytes]();

    // set g0 = 1
    mpz_set_ui(gk[0], 1);

    // from here on l means the exponent returned by getpow(d), i.e. the number
    // of Newton steps (the original author describes it as ceil(log2(d)))
    mpz_t d2; mpz_init_set_ui(d2, d);
    l = getpow(d2, R);
    mpz_clear(d2);

    uint64_t m = 1, q, r;
    // 2^(l-k) with k initially 0; the shift is done in 64 bits (a plain 1<<l is an int shift)
    uint64_t pow2tolsubk = (uint64_t)1 << l;
    mpz_t coeff; mpz_init2(coeff, bits_in_n);
    mpz_t coeff2; mpz_init2(coeff2, bits_in_n);
    mpz_t intgk; mpz_init2(intgk, d*ldiv8*8 + 64);
    mpz_t intfm; mpz_init2(intfm, d*ldiv8*8 + 64);
    mpz_t intfmgk; mpz_init2(intfmgk, 2*d*ldiv8*8 + 64);

    for (uint64_t k = 1; k <= l; k++) {

        // void mpz_import (rop, count, order, size, endian, nails, *op)
        // void * mpz_export (*rop, *countp, order, size, endian, nails, op)

        // pack g(k-1) (m coefficients, m still being the previous size) into intgk;
        // mpz_export writes only the significant bytes, so each slot is cleared first
        memset(prepolygk, 0, m*ldiv8);
        for (uint64_t j = 0; j < m; j++) {
            mpz_export(prepolygk + j*ldiv8, NULL, -1, 1, 0, 0, gk[j]);
        }
        mpz_import(intgk, m*ldiv8, -1, 1, 0, 0, prepolygk);

        // compute m = ceil(d/2^(l-k)); the divisor halves on every step
        pow2tolsubk = pow2tolsubk / 2;
        r = d % pow2tolsubk;
        q = (d-r)/pow2tolsubk;
        m = q;
        if (r > 0) m = q+1;

        // pack fm = f (mod x^m) into intfm; constant term (=1) is at position 0
        memset(prepolyfm, 0, m*ldiv8);
        for (uint64_t j = 0; j < m; j++) {
            mpz_export(prepolyfm + j*ldiv8, NULL, -1, 1, 0, 0, f[j]);
        }
        mpz_import(intfm, m*ldiv8, -1, 1, 0, 0, prepolyfm);

        // fm*gk by one integer multiplication
        mpz_mul(intfmgk, intfm, intgk);

        // unpack the product; the whole buffer is cleared so no stale byte survives
        memset(prepolyfmgk, 0, productBytes);
        mpz_export(prepolyfmgk, NULL, -1, 1, 0, 0, intfmgk);

        // reduce mod x^m, i.e. zero every slot belonging to x^m or higher
        memset(prepolyfmgk + m*ldiv8, 0, productBytes - m*ldiv8);

        // reduce each coefficient mod n and, for the constant term, subtract 1
        // first, so that the buffer now holds fm*gk - 1 (mod x^m, mod n)
        for (uint64_t j = 0; j < m; j++) {
            mpz_import(coeff, ldiv8, -1, 1, 0, 0, prepolyfmgk + j*ldiv8);
            if (j == 0) mpz_sub_ui(coeff, coeff, 1);
            mpz_mod(coeff, coeff, n);
            memset(prepolyfmgk + j*ldiv8, 0, ldiv8);
            mpz_export(prepolyfmgk + j*ldiv8, NULL, -1, 1, 0, 0, coeff);
        }

        // (fm*gk - 1)*gk by one integer multiplication
        mpz_import(intfmgk, m*ldiv8, -1, 1, 0, 0, prepolyfmgk);
        mpz_mul(intfmgk, intfmgk, intgk);
        memset(prepolyfmgk, 0, productBytes);
        mpz_export(prepolyfmgk, NULL, -1, 1, 0, 0, intfmgk);
        // reduce mod x^m again
        memset(prepolyfmgk + m*ldiv8, 0, productBytes - m*ldiv8);

        // new gk = gk - (fm*gk - 1)*gk, one coefficient at a time, reduced mod n.
        // prepolygk still holds the old gk; its slots above the old size are zero.
        for (uint64_t j = 0; j < m; j++) {
            mpz_import(coeff, ldiv8, -1, 1, 0, 0, prepolygk + j*ldiv8);
            mpz_mod(coeff, coeff, n);
            mpz_import(coeff2, ldiv8, -1, 1, 0, 0, prepolyfmgk + j*ldiv8);
            mpz_mod(coeff2, coeff2, n);
            mpz_sub(coeff, coeff, coeff2);
            mpz_mod(coeff, coeff, n);
            mpz_set(gk[j], coeff);
        }
    }

    // gk now has d coefficients (degree d-1); hand them to the caller
    for (uint64_t i = 0; i < d; i++) mpz_set(output[i], gk[i]);

    // release memory
    mpz_clear(intfmgk);
    mpz_clear(intfm);
    mpz_clear(intgk);
    mpz_clear(coeff2);
    mpz_clear(coeff);
    delete[] prepolyfmgk;
    delete[] prepolyfm;
    delete[] prepolygk;

    mpz_clear(R); mpz_clear(dnn);

    for (uint64_t i = 0; i < d; i++) mpz_clear(gk[i]);
    delete[] gk;
}

/*
 * PolyIntProduct
 *
 * Multiplies two polynomials with coefficients mod n using Kronecker
 * substitution: each polynomial is packed into one big integer (one fixed-width
 * byte slot per coefficient), the two integers are multiplied once, and the
 * product is cut back into slots.
 *
 * F, degF   first polynomial; F[0..degF] are read
 * G, degG   second polynomial; G[0..degG] are read
 * n         modulus
 * out       receives out[k] = sum over i+j=k of F[i]*G[j], reduced mod n,
 *           for k = 0..degF+degG (entries must already be initialised)
 *
 * Input coefficients are assumed to lie in [0, n): a larger value would not
 * fit its slot.
 */
void PolyIntProduct(mpz_t* F, mpz_t* G, uint64_t degF, uint64_t degG, mpz_t n, mpz_t* out)
{
    // setup
    mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;
    uint64_t d = degF;
    if (degG > degF) d = degG;

    // A product coefficient is a sum of at most d+1 terms, each below n^2, so
    // every slot must be able to hold (d+1)*n^2.  (Using d*n^2 is too small when
    // degF == degG, and is zero when both inputs are constants.)
    mpz_t dnn; mpz_init_set(dnn,n);
    mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d + 1);    // compute (d+1)*n^2
    mpz_t R; mpz_init(R);
    mp_bitcnt_t b = getpow(dnn, R);     // number of bits required to store (d+1)*n^2
    while ((b%8)!=0) b++;     // pad b until we are at a byte boundary
    long bdiv8 = b / 8;      // slot width in bytes

    // byte images of F, G (d+1 slots each) and of the product (2*d+1 slots);
    // all zero-initialised, which matters because mpz_export only writes the
    // significant bytes of each value
    char* prepolyF = new char[(d+1)*bdiv8]();
    char* prepolyG = new char[(d+1)*bdiv8]();
    char* prepolyFG = new char[(2*d+1)*bdiv8]();
    mpz_t coeff; mpz_init2(coeff, bits_in_n);
    mpz_t intF; mpz_init2(intF, (d+1)*bdiv8*8 + 64);
    mpz_t intG; mpz_init2(intG, (d+1)*bdiv8*8 + 64);
    mpz_t intFG; mpz_init2(intFG, (2*d+1)*bdiv8*8 + 64);

    // convert F to an integer: write coefficient j into slot j, then import
    for (uint64_t j = 0; j < degF+1; j++) {
        mpz_export(prepolyF + j*bdiv8, NULL, -1, 1, 0, 0, F[j]);
    }
    mpz_import(intF, (d+1)*bdiv8, -1, 1, 0, 0, prepolyF);

    // convert G to an integer in the same way
    for (uint64_t j = 0; j < degG+1; j++) {
        mpz_export(prepolyG + j*bdiv8, NULL, -1, 1, 0, 0, G[j]);
    }
    mpz_import(intG, (d+1)*bdiv8, -1, 1, 0, 0, prepolyG);

    // multiply intF and intG using (FFT) integer multiplication
    mpz_mul(intFG, intF, intG);

    mpz_export(prepolyFG, NULL, -1, 1, 0, 0, intFG);

    // read product FG as a polynomial in 2^b of degree degF+degG
    for (uint64_t j = 0; j < degF+degG+1; j++) {
        // slot j holds the (unreduced) jth coefficient
        mpz_import(coeff, bdiv8, -1, 1, 0, 0, prepolyFG + j*bdiv8);
        // reduce coefficient mod n
        mpz_mod(coeff, coeff, n);
        // output coefficient
        mpz_set(out[j], coeff);
    }

    // release memory
    mpz_clear(intFG);
    mpz_clear(intG);
    mpz_clear(intF);
    mpz_clear(coeff);
    delete[] prepolyFG;
    delete[] prepolyG;
    delete[] prepolyF;
    mpz_clear(R);
    mpz_clear(dnn);
}

/*
 * TreePolyIntProduct
 *
 * Same Kronecker-substitution product as PolyIntProduct, except that the second
 * polynomial is read out of a product tree array in descending index order.
 *
 * F, degF         first polynomial; F[0..degF] are read
 * Tree            array holding the second polynomial G
 * treePolyStart   index of G's coefficient 0 in Tree; coefficient j is read
 *                 from Tree[treePolyStart - j], for j = 0..degG
 * degG            degree of G
 * n               modulus
 * out             receives out[k] = sum over i+j=k of F[i]*G[j], reduced mod n,
 *                 for k = 0..degF+degG (entries must already be initialised)
 *
 * Input coefficients are assumed to lie in [0, n): a larger value would not
 * fit its slot.
 */
void TreePolyIntProduct(mpz_t* F, mpz_t* Tree, long treePolyStart, uint64_t degF, uint64_t degG, mpz_t n, mpz_t* out)
{
    // setup
    mp_bitcnt_t bits_in_n = mpz_sizeinbase(n, 2) + 64;
    uint64_t d = degF;
    if (degG > degF) d = degG;

    // A product coefficient is a sum of at most d+1 terms, each below n^2, so
    // every slot must be able to hold (d+1)*n^2.  (Using d*n^2 is too small when
    // degF == degG, and is zero when both inputs are constants.)
    mpz_t dnn; mpz_init(dnn);
    mpz_set(dnn, n); mpz_mul(dnn, dnn, n); mpz_mul_ui(dnn, dnn, d + 1);    // compute (d+1)*n^2
    mpz_t R; mpz_init(R);
    mp_bitcnt_t b = getpow(dnn, R);     // number of bits required to store (d+1)*n^2
    while ((b%8)!=0) b++;     // pad b until we are at a byte boundary
    long bdiv8 = b / 8;      // slot width in bytes

    // byte images of F, G (d+1 slots each) and of the product (2*d+1 slots).
    // They are zero-initialised here and each is written exactly once, so no
    // further clearing is needed before mpz_export.
    char* prepolyF = new char[(d+1)*bdiv8]();
    char* prepolyG = new char[(d+1)*bdiv8]();
    char* prepolyFG = new char[(2*d+1)*bdiv8]();
    mpz_t coeff; mpz_init2(coeff, bits_in_n);
    mpz_t intF; mpz_init2(intF, (d+1)*bdiv8*8 + 64);
    mpz_t intG; mpz_init2(intG, (d+1)*bdiv8*8 + 64);
    mpz_t intFG; mpz_init2(intFG, (2*d+1)*bdiv8*8 + 64);

    // convert F to an integer: write coefficient j into slot j, then import
    for (uint64_t j = 0; j < degF+1; j++) {
        mpz_export(prepolyF + j*bdiv8, NULL, -1, 1, 0, 0, F[j]);
    }
    mpz_import(intF, (d+1)*bdiv8, -1, 1, 0, 0, prepolyF);

    // convert G to an integer; its coefficients run backwards through Tree
    for (uint64_t j = 0; j < degG+1; j++) {
        mpz_export(prepolyG + j*bdiv8, NULL, -1, 1, 0, 0, Tree[treePolyStart - j]);
    }
    mpz_import(intG, (d+1)*bdiv8, -1, 1, 0, 0, prepolyG);

    // multiply intF and intG using (FFT) integer multiplication
    mpz_mul(intFG, intF, intG);

    mpz_export(prepolyFG, NULL, -1, 1, 0, 0, intFG);

    // read product FG as a polynomial in 2^b of degree degF+degG
    for (uint64_t j = 0; j < degF+degG+1; j++) {
        // slot j holds the (unreduced) jth coefficient
        mpz_import(coeff, bdiv8, -1, 1, 0, 0, prepolyFG + j*bdiv8);
        // reduce coefficient mod n
        mpz_mod(coeff, coeff, n);
        // output coefficient
        mpz_set(out[j], coeff);
    }

    // release memory
    mpz_clear(intFG);
    mpz_clear(intG);
    mpz_clear(intF);
    mpz_clear(coeff);
    delete[] prepolyFG;
    delete[] prepolyG;
    delete[] prepolyF;
    mpz_clear(R);
    mpz_clear(dnn);
}

/* ScalarMultiplyHessian
 * 
 * Multiply a point P = (X0:Y0:Z0) on a twisted Hessian curve by a scalar multiple S
 * d	d parameter of twisted Hessian curve
 * a	a parameter of twisted Hessian curve
 * P	point on curve to multiply, in projective coordinates
 * SP   output point [S]P, coordinates reduced mod n (SP may be the same object as P)
 * n	we work modulo n
 * S	scalar multiple
 * 
 * Equation of curve: a*X^3 + Y^3 + Z^3 = d*X*Y*Z
 * 
 * The scalar is processed left to right (double & add) starting BELOW its top
 * bit, so the accumulator starts at P.  Consequently S = 0 and S = 1 both
 * return P reduced mod n.
 * 
 *  Doubling formula for (X1:Y1:Z1) - cost 7M + 1S + 1d
 *      B = Y1*Z1
 *      B2 = 2*B
 *      G = Y1+Z1
 *      A = G^2-B
 *      C = (A-B2)*G
 *      D = A*(Z1-Y1)
 *      E = 3*C-d*X1*B2
 *      X3 = -2*X1*D
 *      Y3 = (D-E)*Z1
 *      Z3 = (D+E)*Y1
 * 
 *  Addition formula for (X1:Y1:Z1) + (X0:Y0:Z0) - cost 12M + 1a but more like 6M in practice since a,X0,Y0,Z0 are small
 *      A = X1*Z0
 *      B = Z1*Z0
 *      C = Y1*X0
 *      D = Y1*Y0
 *      E = Z1*Y0
 *      F = a*X1*X0
 *      X3 = A*B-C*D
 *      Y3 = D*E-F*A
 *      Z3 = F*C-B*E 
*/
void ScalarMultiplyHessian(mpz_t d, mpz_t a, epoint* P, epoint* SP, mpz_t n, mpz_t S) {
    
    mpz_t A, B, C, D, E, F, G;
    mpz_t B2, G2, AmB2, Z1mY1, C3, dX1B2, DmE, DaE;
    mpz_t AB, CD, DE, FA, FC, BE, mulmod, X, Y, Z;
    mpz_t X0, Y0, Z0;
    
    mpz_init(A); mpz_init(B); mpz_init(C); mpz_init(D); mpz_init(E); mpz_init(F); mpz_init(G);
    mpz_init(B2); mpz_init(G2); mpz_init(AmB2);
    mpz_init(Z1mY1); mpz_init(C3); mpz_init(dX1B2); mpz_init(DmE); mpz_init(DaE);
    mpz_init(AB); mpz_init(CD); mpz_init(DE); mpz_init(FA); mpz_init(FC); mpz_init(BE);
    mpz_init(mulmod); mpz_init(X); mpz_init(Y); mpz_init(Z);
    mpz_init(X0); mpz_init(Y0); mpz_init(Z0);
 
    // Private (unreduced) copy of the base point: the add step needs the
    // ORIGINAL P on every iteration, even when SP and P are the same object.
    mpz_set(X0, P->x);
    mpz_set(Y0, P->y);
    mpz_set(Z0, P->z);

    // accumulator starts at P
    mpz_mod(SP->x, X0, n);
    mpz_mod(SP->y, Y0, n);
    mpz_mod(SP->z, Z0, n);
    
    // bit length of S; mpz_sizeinbase is exact for base = power of 2
    const unsigned long long L = (unsigned long long)mpz_sizeinbase(S, 2);
      
    // Scalar multiplication using double & add algorithm, from the bit just
    // below the most significant one down to bit 0
    for (unsigned long long i = 2; i <= L; i++) {
        // double (X1:Y1:Z1) = (SP->x:SP->y:SP->z)
        mpz_mul(mulmod, SP->y, SP->z); mpz_mod(B, mulmod, n);           // B  = Y1*Z1
        mpz_mul_2exp(B2, B, 1);                                         // B2 = 2*B
        mpz_add(G, SP->y, SP->z);                                       // G  = Y1+Z1
        mpz_mul(mulmod, G, G); mpz_mod(G2, mulmod, n);
        mpz_sub(A, G2, B);                                              // A  = G^2-B
        mpz_sub(AmB2, A, B2);
        mpz_mul(mulmod, AmB2, G); mpz_mod(C, mulmod, n);                // C  = (A-B2)*G
        mpz_sub(Z1mY1, SP->z, SP->y);
        mpz_mul(mulmod, A, Z1mY1); mpz_mod(D, mulmod, n);               // D  = A*(Z1-Y1)
        mpz_mul_ui(C3, C, 3);
        mpz_mul(dX1B2, d, SP->x);
        mpz_mul(mulmod, dX1B2, B2); mpz_mod(dX1B2, mulmod, n);
        mpz_sub(E, C3, dX1B2);                                          // E  = 3*C-d*X1*B2
        mpz_mul_si(mulmod, SP->x, -2);
        mpz_mul(X, mulmod, D);                                          // X3 = -2*X1*D
        mpz_sub(DmE, D, E);
        mpz_add(DaE, D, E);
        mpz_mul(Y, DmE, SP->z);                                         // Y3 = (D-E)*Z1
        mpz_mul(Z, DaE, SP->y);                                         // Z3 = (D+E)*Y1
        // intermediate values may be negative; mpz_mod brings them back to [0, n)
        mpz_mod(SP->x, X, n);
        mpz_mod(SP->y, Y, n);
        mpz_mod(SP->z, Z, n);
        if (mpz_tstbit(S, L - i) == 1) {
            // add (X1:Y1:Z1) = (SP->x:SP->y:SP->z) and (X0:Y0:Z0) = (P->x:P->y:P->z)
            mpz_mul(A, SP->x, Z0);
            mpz_mul(B, SP->z, Z0);
            mpz_mul(C, SP->y, X0);
            mpz_mul(D, SP->y, Y0);
            mpz_mul(E, SP->z, Y0);
            mpz_mul(F, a, SP->x);
            mpz_mul(F, F, X0);
            // products are deliberately left unreduced; one mpz_mod per
            // coordinate at the end is enough
            mpz_mul(AB, A, B);
            mpz_mul(CD, C, D);
            mpz_mul(DE, D, E);
            mpz_mul(FA, F, A);
            mpz_mul(FC, F, C);
            mpz_mul(BE, B, E);
            mpz_sub(X, AB, CD);
            mpz_sub(Y, DE, FA);
            mpz_sub(Z, FC, BE);
            mpz_mod(SP->x, X, n);
            mpz_mod(SP->y, Y, n);
            mpz_mod(SP->z, Z, n);
        }
    }
	    
    mpz_clear(A); mpz_clear(B); mpz_clear(C); mpz_clear(D); mpz_clear(E); mpz_clear(F); mpz_clear(G);
    mpz_clear(B2); mpz_clear(G2); mpz_clear(AmB2);
    mpz_clear(Z1mY1); mpz_clear(C3); mpz_clear(dX1B2); mpz_clear(DmE); mpz_clear(DaE);
    mpz_clear(AB); mpz_clear(CD); mpz_clear(DE); mpz_clear(FA); mpz_clear(FC); mpz_clear(BE);
    mpz_clear(mulmod); mpz_clear(X); mpz_clear(Y); mpz_clear(Z);
    mpz_clear(X0); mpz_clear(Y0); mpz_clear(Z0);
}

//   Addition on a twisted Hessian curve
//
//      a*X^3 + Y^3 + Z^3 = d*X*Y*Z
//
//   Computes R = P + Q in projective coordinates, each coordinate of R reduced mod n.
//   With (X1:Y1:Z1) = P and (X0:Y0:Z0) = Q:
//      A = X1*Z0,  B = Z1*Z0,  C = Y1*X0,  D = Y1*Y0,  E = Z1*Y0,  F = a*X1*X0
//      X3 = A*B-C*D,  Y3 = D*E-F*A,  Z3 = F*C-B*E
//
//   d is not used by the addition formula.  R is only written after every
//   input has been read, so R may be the same object as P or Q.
//
void HEnAdd(mpz_t d, mpz_t a, epoint* P, epoint* Q, epoint* R, mpz_t n) {
        
    mpz_t A, B, C, D, E, F;
    mpz_t AB, CD, DE, FA, FC, BE, X, Y, Z;
    
    mpz_init(A); mpz_init(B); mpz_init(C); mpz_init(D); mpz_init(E); mpz_init(F);
    mpz_init(AB); mpz_init(CD); mpz_init(DE); mpz_init(FA); mpz_init(FC); mpz_init(BE);
    mpz_init(X); mpz_init(Y); mpz_init(Z);

    // add (X1:Y1:Z1) = (P->x:P->y:P->z) and (X0:Y0:Z0) = (Q->x:Q->y:Q->z)
    mpz_mul(A, P->x, Q->z);
    mpz_mul(B, P->z, Q->z);
    mpz_mul(C, P->y, Q->x);
    mpz_mul(D, P->y, Q->y);
    mpz_mul(E, P->z, Q->y);
    mpz_mul(F, a, P->x);
    mpz_mul(F, F, Q->x);

    // products are deliberately left unreduced; one mpz_mod per coordinate
    // at the end is enough
    mpz_mul(AB, A, B);
    mpz_mul(CD, C, D);
    mpz_mul(DE, D, E);
    mpz_mul(FA, F, A);
    mpz_mul(FC, F, C);
    mpz_mul(BE, B, E);

    mpz_sub(X, AB, CD);
    mpz_sub(Y, DE, FA);
    mpz_sub(Z, FC, BE);

    // differences may be negative; mpz_mod brings them back to [0, n)
    mpz_mod(R->x, X, n);
    mpz_mod(R->y, Y, n);
    mpz_mod(R->z, Z, n);
    
    mpz_clear(Z); mpz_clear(Y); mpz_clear(X);
    mpz_clear(BE); mpz_clear(FC); mpz_clear(FA); mpz_clear(DE); mpz_clear(CD); mpz_clear(AB);
    mpz_clear(F); mpz_clear(E); mpz_clear(D); mpz_clear(C); mpz_clear(B); mpz_clear(A); 
}


// PolyReduceModF
//
// Reduces the polynomial HGj modulo F (coefficients mod n) and stores the
// degF low coefficients of the remainder in H[0..degF-1].
//
//  HGj, degHGj   polynomial to reduce; reversed in place while the quotient is
//                computed and restored to its original order before returning
//  F, degF       modulus polynomial; F[0..degF] are read
//  degG          sizes the scratch arrays (quotient has degG+1 slots)
//                NOTE(review): this presumes degHGj <= 2*degG and
//                degHGj - degF <= degG - confirm against the callers
//  n             coefficient modulus
//  revinvF       passed to PolyIntProduct as a degree-degF polynomial; by its
//                name, the reciprocal of the reversed F (see PolyReciprocal)
//  H             output remainder (entries must already be initialised)
//
// Method (Bernstein, "Fast Multiplication and its Applications"): the quotient
// q is obtained from rev(HGj)*revinvF, then the remainder is HGj - q*F.
//
// NOTE(review): the getstr/cout calls dump every intermediate polynomial to
// stdout; they look like leftover diagnostics but are kept as they are part of
// the function's observable output.
void PolyReduceModF(mpz_t* HGj, long degHGj, mpz_t* F, long degG, long degF, mpz_t n, mpz_t* revinvF, mpz_t* H)
{
    const long lenRevF = degF + 1;
    const long lenHGjdivF = 2*degG + degF + 1;
    const long lenQ = degG + 1;
    const long lenQF = degG + degF + 1;

    // we keep a reversed copy of F = Tree[0]
    mpz_t* revF = new mpz_t[lenRevF];
    for (long i = 0; i < lenRevF; i++) mpz_init_set(revF[i], F[degF-i]);
    for (long i = 0; i < lenRevF; i++)
        getstr(revF[i]);
    cout << "\n(revF)----------\n";

    // Scratch for rev(HGj)*revinvF
    mpz_t* HGjdivF = new mpz_t[lenHGjdivF];
    for (long i = 0; i < lenHGjdivF; i++) mpz_init(HGjdivF[i]);
    // Initialize array to hold q
    mpz_t* q = new mpz_t[lenQ];
    for (long i = 0; i < lenQ; i++) mpz_init(q[i]);
    // Initialize array to hold q*F
    mpz_t* qF = new mpz_t[lenQF];
    for (long i = 0; i < lenQF; i++) mpz_init(qF[i]);

    // now reduce HGj (mod F) using method shown in Fast Multiplication and its Applications (Bernstein)
    mpz_t coeff; mpz_init(coeff);

    // reverse HGj in place
    for (long i = 0; i < (degHGj+1)/2; i++) {
        mpz_set(coeff, HGj[i]);
        mpz_set(HGj[i], HGj[degHGj-i]);
        mpz_set(HGj[degHGj-i], coeff);
    }
    for (long i = 0; i < degHGj+1; i++)
        getstr(HGj[i]);
    cout << "\n(revHGj)----------\n";

    PolyIntProduct(HGj, revinvF, degHGj, degF, n, HGjdivF);
    for (long i = 0; i < lenHGjdivF; i++)
        getstr(HGjdivF[i]);
    cout << "\n(HGjdivF)----------\n";

    // Note that deg HGj < e.
    // reduce mod x^(e-d), i.e. discard all powers of x not less than e-d
    // where e-1 is the degree of HGj and d that of 1/F (precision degF+1)
    // here e = degHGj+1, d = degF, so e-d = degHGj+1-degF
    // we also reverse at the same time to give polynomial q
    // deg(qF) = deg(q) + deg(F) = degHGj-degF + degF = degHGj
    const long lenQuotient = degHGj + 1 - degF;
    for (long i = 0; i < lenQuotient; i++) mpz_set(q[i], HGjdivF[degHGj-degF-i]);
    for (long i = 0; i < lenQuotient; i++)
        getstr(q[i]);
    cout << "\n(q)----------\n";

    // "un"-reverse HGj so the caller's array is left as it was
    for (long i = 0; i < (degHGj+1)/2; i++) {
        mpz_set(coeff, HGj[i]);
        mpz_set(HGj[i], HGj[degHGj-i]);
        mpz_set(HGj[degHGj-i], coeff);
    }

    // now compute r = HGj - q*F.  This is H*Gj (mod F)
    PolyIntProduct(q, revF, degG, degF, n, qF);
    for (long i = 0; i < lenQF; i++)
        getstr(qF[i]);
    cout << "\n(qF)----------\n";

    // subtract qF from HGj one coefficient at a time, for degF coefficients
    for (long i = 0; i < degF; i++) {
        mpz_sub(coeff, HGj[i], qF[i]);
        mpz_mod(coeff, coeff, n);
        mpz_set(H[i], coeff);
    }

    for (long i = 0; i < degF; i++)
        getstr(H[i]);
    cout << "\n----------\n";

    // release every temporary
    mpz_clear(coeff);
    for (long i = 0; i < lenQF; i++) mpz_clear(qF[i]);
    delete[] qF;
    for (long i = 0; i < lenQ; i++) mpz_clear(q[i]);
    delete[] q;
    for (long i = 0; i < lenHGjdivF; i++) mpz_clear(HGjdivF[i]);
    delete[] HGjdivF;
    for (long i = 0; i < lenRevF; i++) mpz_clear(revF[i]);
    delete[] revF;
}
