/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.compress.lib;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.CompressedMatrixBlockFactory;
import org.apache.sysds.runtime.compress.colgroup.AColGroup;
import org.apache.sysds.runtime.compress.colgroup.ASDCZero;
import org.apache.sysds.runtime.compress.colgroup.ColGroupConst;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
import org.apache.sysds.runtime.compress.lib.CLALibScalar;
import org.apache.sysds.runtime.compress.lib.CLALibUtils;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.functionobjects.Divide;
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Minus1Multiply;
import org.apache.sysds.runtime.functionobjects.MinusMultiply;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.functionobjects.PlusMultiply;
import org.apache.sysds.runtime.functionobjects.ValueComparisonFunction;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
import org.apache.sysds.runtime.matrix.data.LibMatrixBincell;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysds.runtime.util.CommonThreadPool;

public final class CLALibBinaryCellOp {
    /** Class logger; used in this file to warn when an operation falls back to full decompression. */
    private static final Log LOG = LogFactory.getLog((String)CLALibBinaryCellOp.class.getName());

    /** Private constructor: this class only declares static members and is not meant to be instantiated. */
    private CLALibBinaryCellOp() {
    }

    /**
     * Cell-wise binary operation with the compressed matrix as the left operand ({@code m1 op that}).
     *
     * <p>A 1x1 {@code that} is turned into a {@link RightScalarOperator} and handled by
     * {@link CLALibScalar}; an empty {@code that} is routed to the empty-operand shortcuts; every other
     * input goes through the general right-hand path.
     *
     * @param op     the binary operator to apply
     * @param m1     the compressed left operand
     * @param that   the right operand
     * @param result optional output block handed on to the selected implementation
     * @return the block produced by the selected implementation
     */
    public static MatrixBlock binaryOperationsRight(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, MatrixBlock result) {
        final boolean isScalar = that.getNumRows() == 1 && that.getNumColumns() == 1;
        if (isScalar) {
            final double scalar = that.getValue(0, 0);
            final RightScalarOperator scalarOp = new RightScalarOperator(op.fn, scalar, op.getNumThreads());
            return CLALibScalar.scalarOperations(scalarOp, m1, result);
        }
        return that.isEmpty()
            ? CLALibBinaryCellOp.binaryOperationsEmpty(op, m1, that, result)
            : CLALibBinaryCellOp.binaryOperationsRightFiltered(op, m1, that, result);
    }

    /**
     * General right-hand path: validates dimensions, determines the access type and dispatches.
     *
     * <p>When {@code that} is itself compressed and reports a smaller in-memory size than {@code m1}, the
     * roles are swapped: {@code m1} is decompressed and {@code that} is kept compressed, with the
     * {@code left} flag set so the operand order is still {@code m1 op that}. Otherwise {@code that} is
     * decompressed (if needed) and {@code m1} stays compressed.
     */
    private static MatrixBlock binaryOperationsRightFiltered(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, MatrixBlock result) {
        LibMatrixBincell.isValidDimensionsBinaryExtended(m1, that);
        final LibMatrixBincell.BinaryAccessType atype = LibMatrixBincell.getBinaryAccessTypeExtended(m1, that);

        final boolean keepThatCompressed = that instanceof CompressedMatrixBlock
            && that.getInMemorySize() < m1.getInMemorySize();
        if (!keepThatCompressed) {
            final MatrixBlock rhs = CompressedMatrixBlock.getUncompressed(that, "Decompressing right side in BinaryOps");
            return CLALibBinaryCellOp.selectProcessingBasedOnAccessType(op, m1, rhs, result, atype, false);
        }

        final MatrixBlock lhs = CompressedMatrixBlock.getUncompressed(m1, "Decompressing left side in BinaryOps");
        return CLALibBinaryCellOp.selectProcessingBasedOnAccessType(op, (CompressedMatrixBlock)that, lhs, result, atype, true);
    }

    /**
     * Cell-wise binary operation with the compressed matrix as the right operand ({@code that op m1}).
     *
     * <p>A 1x1 {@code that} is turned into a {@link LeftScalarOperator} and handled by
     * {@link CLALibScalar}. An empty {@code that} is not supported here.
     *
     * @param op     the binary operator to apply
     * @param m1     the compressed right operand
     * @param that   the left operand
     * @param result optional output block handed on to the selected implementation
     * @return the block produced by the selected implementation
     * @throws NotImplementedException if {@code that} is empty (and not 1x1)
     */
    public static MatrixBlock binaryOperationsLeft(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, MatrixBlock result) {
        final boolean isScalar = that.getNumRows() == 1 && that.getNumColumns() == 1;
        if (isScalar) {
            final double scalar = that.getValue(0, 0);
            final LeftScalarOperator scalarOp = new LeftScalarOperator(op.fn, scalar, op.getNumThreads());
            return CLALibScalar.scalarOperations(scalarOp, m1, result);
        }
        if (that.isEmpty()) {
            throw new NotImplementedException("Not handling left empty yet");
        }

        final MatrixBlock lhs = CompressedMatrixBlock.getUncompressed(that, "Decompressing left side in BinaryOps");
        // Validation and access type are computed in operand order: left operand first.
        LibMatrixBincell.isValidDimensionsBinaryExtended(lhs, m1);
        final LibMatrixBincell.BinaryAccessType atype = LibMatrixBincell.getBinaryAccessTypeExtended(lhs, m1);
        return CLALibBinaryCellOp.selectProcessingBasedOnAccessType(op, m1, lhs, result, atype, true);
    }

    /**
     * Shortcuts for an empty right operand.
     *
     * <ul>
     * <li>{@link Multiply}: constant 0 block with the shape of {@code m1}.</li>
     * <li>{@link Minus1Multiply}: constant 1 block with the shape of {@code m1}.</li>
     * <li>{@link Minus}, {@link Plus}, {@link MinusMultiply}, {@link PlusMultiply}: a copy of {@code m1}.</li>
     * <li>anything else: delegated to the general right-hand path.</li>
     * </ul>
     */
    private static MatrixBlock binaryOperationsEmpty(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, MatrixBlock result) {
        final ValueFunction fn = op.fn;

        if (fn instanceof Multiply || fn instanceof Minus1Multiply) {
            // Multiply is tested first, so it wins if a function were to match both.
            final double fill = fn instanceof Multiply ? 0.0 : 1.0;
            return CompressedMatrixBlockFactory.createConstant(m1.getNumRows(), m1.getNumColumns(), fill);
        }

        final boolean unchangedByZero = fn instanceof Minus || fn instanceof Plus
            || fn instanceof MinusMultiply || fn instanceof PlusMultiply;
        if (!unchangedByZero) {
            return CLALibBinaryCellOp.binaryOperationsRightFiltered(op, m1, that, result);
        }

        final CompressedMatrixBlock copy = new CompressedMatrixBlock();
        copy.copy(m1);
        return copy;
    }

    /**
     * Dispatches to an implementation based on the binary access type.
     *
     * <p>For column-vector and matrix-matrix access, a cached decompressed version of {@code m1} is used
     * directly when one exists; otherwise the compressed column-vector / matrix-matrix kernels are used.
     * Row-vector access goes to {@code rowBinCellOp}. Everything else decompresses {@code m1} and runs the
     * uncompressed operation.
     *
     * @param left {@code true} when {@code that} is the left operand of the operation
     * @throws NotImplementedException for a cached-decompressed {@code COL_VECTOR_MATRIX} with {@code left}
     */
    private static MatrixBlock selectProcessingBasedOnAccessType(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, MatrixBlock result, LibMatrixBincell.BinaryAccessType atype, boolean left) {
        final boolean colVector = atype == LibMatrixBincell.BinaryAccessType.MATRIX_COL_VECTOR
            || atype == LibMatrixBincell.BinaryAccessType.COL_VECTOR_MATRIX;
        final boolean matrixMatrix = atype == LibMatrixBincell.BinaryAccessType.MATRIX_MATRIX;

        if (colVector || matrixMatrix) {
            final MatrixBlock cached = m1.getCachedDecompressed();
            if (cached == null) {
                return colVector
                    ? CLALibBinaryCellOp.binaryMVCol(m1, that, op, left)
                    : CLALibBinaryCellOp.binaryMM(m1, that, op, left);
            }
            if (left && atype == LibMatrixBincell.BinaryAccessType.COL_VECTOR_MATRIX) {
                throw new NotImplementedException("Binary row op left is not supported for Uncompressed Matrix, Implement support for VMr in MatrixBlock Binary Cell operations");
            }
            return left ? that.binaryOperations(op, cached) : cached.binaryOperations(op, that);
        }

        // Parentheses spell out Java's precedence (&& binds tighter than ||): the supported-op check only
        // guards MATRIX_ROW_VECTOR, while ROW_VECTOR_MATRIX always takes the row path.
        // NOTE(review): confirm this asymmetry is intended rather than a missing pair of parentheses.
        final boolean rowPath = (CLALibBinaryCellOp.isSupportedBinaryCellOp(op.fn)
            && atype == LibMatrixBincell.BinaryAccessType.MATRIX_ROW_VECTOR)
            || atype == LibMatrixBincell.BinaryAccessType.ROW_VECTOR_MATRIX;
        if (rowPath) {
            return CLALibBinaryCellOp.rowBinCellOp(m1, that, result, op, left);
        }

        // NOTE(review): this fallback does not look at `left`; verify operand order for left-side calls.
        final MatrixBlock uncompressed = CompressedMatrixBlock.getUncompressed(m1, "BinaryOp: " + op.fn);
        return uncompressed.binaryOperations(op, that, result);
    }

    /**
     * Tells whether {@code fn} is one of the functions accepted for the matrix/row-vector path:
     * {@link Multiply}, {@link Divide}, {@link Plus}, {@link Minus}, {@link MinusMultiply} or
     * {@link PlusMultiply}.
     */
    private static boolean isSupportedBinaryCellOp(ValueFunction fn) {
        if (fn instanceof Multiply || fn instanceof Divide) {
            return true;
        }
        if (fn instanceof Plus || fn instanceof Minus) {
            return true;
        }
        return fn instanceof MinusMultiply || fn instanceof PlusMultiply;
    }

    /**
     * Provides the compressed output block for an operation on {@code m1}.
     *
     * <p>If {@code result} already is a {@link CompressedMatrixBlock} it is reset to the dimensions of
     * {@code m1} and reused; otherwise (including {@code null}) a new block of those dimensions is created.
     */
    private static CompressedMatrixBlock setupCompressedReturnMatrixBlock(CompressedMatrixBlock m1, MatrixValue result) {
        // instanceof is false for null, so no separate null check is needed.
        if (result instanceof CompressedMatrixBlock) {
            final CompressedMatrixBlock reused = (CompressedMatrixBlock)result;
            reused.reset(m1.getNumRows(), m1.getNumColumns());
            return reused;
        }
        return new CompressedMatrixBlock(m1.getNumRows(), m1.getNumColumns());
    }

    /**
     * Runs a row-vector binary operation into a compressed output block.
     *
     * <p>Chooses the overlapping kernel when {@code isValidForOverlappingBinaryCellOperations} accepts the
     * input and the non-overlapping kernel otherwise, then recomputes the non-zero count of the output.
     */
    private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op, boolean left) {
        final CompressedMatrixBlock out = CLALibBinaryCellOp.setupCompressedReturnMatrixBlock(m1, ret);
        final boolean useOverlapping = CLALibBinaryCellOp.isValidForOverlappingBinaryCellOperations(m1, op);
        if (!useOverlapping) {
            CLALibBinaryCellOp.nonOverlappingBinaryCellOp(m1, m2, out, op, left);
        } else {
            CLALibBinaryCellOp.overlappingBinaryCellOp(m1, m2, out, op, left);
        }
        out.recomputeNonZeros();
        return out;
    }

    /**
     * Row-vector operation for the non-overlapping case.
     *
     * <p>When the (non-extended) access type of {@code (m1, m2)} is {@code MATRIX_ROW_VECTOR} the
     * per-column-group row kernel is used. Any other access type logs a warning, decompresses {@code m1}
     * and runs the uncompressed operation with {@code ret} as the result argument.
     */
    private static void nonOverlappingBinaryCellOp(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
        final LibMatrixBincell.BinaryAccessType atype = LibMatrixBincell.getBinaryAccessType(m1, m2);
        if (atype == LibMatrixBincell.BinaryAccessType.MATRIX_ROW_VECTOR) {
            CLALibBinaryCellOp.binaryMVRow(m1, m2, ret, op, left);
            return;
        }
        LOG.warn((Object)("Inefficient Decompression for " + op + "  " + atype));
        m1.decompress().binaryOperations(op, m2, ret);
    }

    /**
     * Returns {@code true} only when {@code m1} reports itself as overlapping and the operator function is
     * {@link Plus} or {@link Minus}.
     */
    private static boolean isValidForOverlappingBinaryCellOperations(CompressedMatrixBlock m1, BinaryOperator op) {
        if (!m1.isOverlapping()) {
            return false;
        }
        final ValueFunction fn = op.fn;
        return fn instanceof Plus || fn instanceof Minus;
    }

    /**
     * Row-vector operation for the overlapping case; a thin delegate to {@code binaryMVPlusStack} whose
     * return value is ignored (the result is written through {@code ret}).
     */
    private static void overlappingBinaryCellOp(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
        CLALibBinaryCellOp.binaryMVPlusStack(m1, m2, ret, op, left);
    }

    /**
     * Applies a row-vector binary operation to every column group of {@code m1} and stores the resulting
     * groups in {@code ret}.
     *
     * <p>The groups are processed sequentially when the operator asks for at most one thread or there is
     * at most one group, and through the thread pool otherwise.
     *
     * @param m1   the compressed input
     * @param v    the row vector as a dense array
     * @param ret  the compressed output; receives the new column groups
     * @param op   the binary operator
     * @param left {@code true} when the vector is the left operand
     * @return {@code ret}
     */
    private static CompressedMatrixBlock binaryMVRow(CompressedMatrixBlock m1, double[] v, CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
        final List<AColGroup> oldColGroups = m1.getColGroups();
        final int k = op.getNumThreads();
        final List<AColGroup> newColGroups = new ArrayList<AColGroup>(oldColGroups.size());
        final boolean isRowSafe = left ? op.isRowSafeLeft(v) : op.isRowSafeRight(v);
        if (k <= 1 || oldColGroups.size() <= 1) {
            CLALibBinaryCellOp.binaryMVRowSingleThread(oldColGroups, v, op, left, newColGroups, isRowSafe);
        } else {
            CLALibBinaryCellOp.binaryMVRowMultiThread(oldColGroups, v, op, left, newColGroups, isRowSafe, k);
        }
        ret.allocateColGroupList(newColGroups);
        // Widen before multiplying: rows * cols can exceed Integer.MAX_VALUE for large matrices.
        ret.setNonZeros((long)m1.getNumColumns() * (long)m1.getNumRows());
        return ret;
    }

    /**
     * Sequentially applies the row operation to each group in {@code oldColGroups}, appending each result
     * to {@code newColGroups} in the same order. {@code left} selects {@code binaryRowOpLeft} over
     * {@code binaryRowOpRight}.
     */
    private static void binaryMVRowSingleThread(List<AColGroup> oldColGroups, double[] v, BinaryOperator op, boolean left, List<AColGroup> newColGroups, boolean isRowSafe) {
        for (AColGroup group : oldColGroups) {
            final AColGroup updated = left
                ? group.binaryRowOpLeft(op, v, isRowSafe)
                : group.binaryRowOpRight(op, v, isRowSafe);
            newColGroups.add(updated);
        }
    }

    /**
     * Applies the row operation to each column group using a pool obtained for {@code k} threads, one
     * task per group. Results are appended to {@code newColGroups} in the order of {@code oldColGroups}.
     *
     * <p>The pool is shut down in a {@code finally} block so it is released on every exit path, including
     * unchecked exceptions. If the calling thread is interrupted, its interrupt flag is restored before
     * the exception is wrapped.
     *
     * @throws DMLRuntimeException wrapping an interruption or a task failure
     */
    private static void binaryMVRowMultiThread(List<AColGroup> oldColGroups, double[] v, BinaryOperator op, boolean left, List<AColGroup> newColGroups, boolean isRowSafe, int k) {
        final ExecutorService pool = CommonThreadPool.get(k);
        try {
            final List<BinaryMVRowTask> tasks = new ArrayList<BinaryMVRowTask>(oldColGroups.size());
            for (AColGroup group : oldColGroups) {
                tasks.add(left
                    ? new BinaryMVRowTaskLeft(group, v, op, isRowSafe)
                    : new BinaryMVRowTaskRight(group, v, op, isRowSafe));
            }
            for (Future<AColGroup> future : pool.invokeAll(tasks)) {
                newColGroups.add(future.get());
            }
        }
        catch (InterruptedException e) {
            // Keep the interruption visible to callers further up the stack.
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (ExecutionException e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            pool.shutdown();
        }
    }

    /**
     * Overload taking the row vector as a {@link MatrixBlock}: converts it to a dense array with
     * {@code forceMatrixBlockToDense} and delegates to the array-based overload.
     */
    private static CompressedMatrixBlock binaryMVRow(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
        final double[] rowVector = CLALibBinaryCellOp.forceMatrixBlockToDense(m2);
        return CLALibBinaryCellOp.binaryMVRow(m1, rowVector, ret, op, left);
    }

    /**
     * Returns row 0 of {@code m2} as a dense array.
     *
     * <p>For a block in dense format the block's own value array is returned (no copy). For a block in
     * sparse format a new array of length {@code m2.getNumColumns()} is filled from the entries of sparse
     * row 0; an empty sparse row yields an all-zero array.
     *
     * @param m2 the block to read; only row 0 is read in the sparse case
     * @return the dense values
     * @throws DMLRuntimeException if {@code m2} claims sparse format but has no sparse block
     */
    private static double[] forceMatrixBlockToDense(MatrixBlock m2) {
        if (!m2.isInSparseFormat()) {
            return m2.getDenseBlockValues();
        }
        final SparseBlock sb = m2.getSparseBlock();
        if (sb == null) {
            throw new DMLRuntimeException("Unknown matrix block type");
        }
        final double[] v = new double[m2.getNumColumns()];
        if (sb.isEmpty(0)) {
            // Nothing to copy; also avoids touching index/value arrays of an empty row.
            return v;
        }
        // The row occupies [pos, pos + size) in the index/value arrays, not [pos, size).
        final int start = sb.pos(0);
        final int end = start + sb.size(0);
        final int[] spI = sb.indexes(0);
        final double[] spV = sb.values(0);
        for (int i = start; i < end; ++i) {
            v[spI[i]] = spV[i];
        }
        return v;
    }

    /**
     * Row-vector operation for overlapping inputs that changes at most one column group.
     *
     * <p>All groups of {@code m1} are carried over. Among the groups that span every column of {@code m1}
     * and are not {@link ASDCZero}, the one with the fewest values (first on ties) gets the row operation
     * applied and is replaced in place. If no such group exists, a constant group built from {@code m2} is
     * appended instead (when a dictionary could be created for it).
     *
     * <p>The output is marked overlapping and its non-zero count is set to unknown ({@code -1}).
     *
     * @param m1   the compressed input
     * @param m2   the row vector
     * @param ret  the compressed output; receives the column groups
     * @param op   the binary operator
     * @param left {@code true} when the vector is the left operand
     * @return {@code ret}
     */
    protected static CompressedMatrixBlock binaryMVPlusStack(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
        final List<AColGroup> oldColGroups = m1.getColGroups();
        final int size = oldColGroups.size();
        final List<AColGroup> newColGroups = new ArrayList<AColGroup>(size);
        final int nCol = m1.getNumColumns();

        int smallestIndex = 0;
        int smallestSize = Integer.MAX_VALUE;
        for (int i = 0; i < size; ++i) {
            final AColGroup g = oldColGroups.get(i);
            final int nVals = g.getNumValues();
            newColGroups.add(g);
            if (nVals < smallestSize && g.getNumCols() == nCol && !(g instanceof ASDCZero)) {
                smallestIndex = i;
                smallestSize = nVals;
            }
        }

        if (smallestSize == Integer.MAX_VALUE) {
            // NOTE(review): this branch appends m2 as-is and does not consult `op` or `left`;
            // confirm that is the intended result for Minus.
            final MatrixBlockDictionary newDict = MatrixBlockDictionary.create(m2);
            if (newDict != null) {
                newColGroups.add(ColGroupConst.create(nCol, (IDictionary)newDict));
            }
        } else {
            // Go through the shared helper so a sparse-format m2 is expanded to a dense row,
            // matching what the non-overlapping row path does.
            final double[] row = CLALibBinaryCellOp.forceMatrixBlockToDense(m2);
            final AColGroup target = newColGroups.get(smallestIndex);
            final AColGroup updated = left
                ? target.binaryRowOpLeft(op, row, op.isRowSafeLeft(row))
                : target.binaryRowOpRight(op, row, op.isRowSafeRight(row));
            newColGroups.set(smallestIndex, updated);
        }

        ret.allocateColGroupList(newColGroups);
        ret.setOverlapping(true);
        ret.setNonZeros(-1L);
        return ret;
    }

    /**
     * Column-vector binary operation producing an uncompressed, dense-allocated result.
     *
     * <p>{@code m1} is first passed through {@code morph}. The work runs on the calling thread when the
     * operator asks for at most one thread and through the thread pool otherwise. For a
     * {@link ValueComparisonFunction}, a reported non-zero count equal to the number of cells yields a
     * constant 1 block and a count of zero yields a constant 0 block. Otherwise the count is stored on the
     * result and its sparsity is examined.
     */
    private static MatrixBlock binaryMVCol(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) {
        final int nRows = m1.getNumRows();
        final int nCols = m1.getNumColumns();
        final CompressedMatrixBlock morphed = CLALibBinaryCellOp.morph(m1);
        final MatrixBlock ret = new MatrixBlock(nRows, nCols, false, -1L).allocateBlock();

        final long nnz;
        if (op.getNumThreads() <= 1) {
            nnz = CLALibBinaryCellOp.binaryMVColSingleThread(morphed, m2, op, left, ret);
        } else {
            nnz = CLALibBinaryCellOp.binaryMVColMultiThread(morphed, m2, op, left, ret);
        }

        if (op.fn instanceof ValueComparisonFunction) {
            final long cells = (long)nRows * (long)nCols;
            if (nnz == cells) {
                return CompressedMatrixBlockFactory.createConstant(nRows, nCols, 1.0);
            }
            if (nnz == 0L) {
                return CompressedMatrixBlockFactory.createConstant(nRows, nCols, 0.0);
            }
        }

        ret.setNonZeros(nnz);
        ret.examSparsity();
        return ret;
    }

    /**
     * Runs the column-vector operation over all rows of {@code m1} on the calling thread and returns the
     * value reported by the task, widened to {@code long}.
     */
    private static long binaryMVColSingleThread(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) {
        final int nRows = m1.getNumRows();
        if (left) {
            return new BinaryMVColLeftTask(m1, m2, ret, 0, nRows, op).call().intValue();
        }
        return new BinaryMVColTask(m1, m2, ret, 0, nRows, op).call().intValue();
    }

    /**
     * Runs the column-vector operation in parallel over row ranges of {@code m1} and returns the sum of
     * the values reported by the tasks.
     *
     * <p>Rows are split into ranges of {@code max(rows / threads, 1)} rows. The lower bound of one row
     * per range keeps the loop advancing when there are more threads than rows. The pool is always shut
     * down, and an interruption restores the thread's interrupt flag before being wrapped.
     *
     * @throws DMLRuntimeException wrapping an interruption or a task failure
     */
    private static long binaryMVColMultiThread(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) {
        final int nRows = m1.getNumRows();
        final int k = Math.max(op.getNumThreads(), 1);
        // Never 0: a zero step would loop forever on `i += blkz`.
        final int blkz = Math.max(ret.getNumRows() / k, 1);
        long nnz = 0L;
        final ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
        try {
            final List<Callable<Integer>> tasks = new ArrayList<Callable<Integer>>();
            for (int i = 0; i < nRows; i += blkz) {
                final int end = Math.min(nRows, i + blkz);
                if (left) {
                    tasks.add(new BinaryMVColLeftTask(m1, m2, ret, i, end, op));
                } else {
                    tasks.add(new BinaryMVColTask(m1, m2, ret, i, end, op));
                }
            }
            for (Future<Integer> f : pool.invokeAll(tasks)) {
                nnz += f.get().intValue();
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (ExecutionException e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // Release the pool on success and on every failure path.
            pool.shutdown();
        }
        return nnz;
    }

    /**
     * Matrix-matrix cell-wise operation producing an uncompressed, dense-allocated result.
     *
     * <p>{@code m1} is passed through {@code morph}, the operation is executed by
     * {@code binaryMMMultiThread}, and the returned non-zero count is stored on the result before its
     * sparsity is examined.
     */
    private static MatrixBlock binaryMM(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) {
        final int nRows = m1.getNumRows();
        final int nCols = m1.getNumColumns();
        final CompressedMatrixBlock morphed = CLALibBinaryCellOp.morph(m1);
        final MatrixBlock out = new MatrixBlock(nRows, nCols, false, -1L).allocateBlock();
        out.setNonZeros(CLALibBinaryCellOp.binaryMMMultiThread(morphed, m2, op, left, out));
        out.examSparsity();
        return out;
    }

    /**
     * Runs the matrix-matrix operation in parallel over row ranges of {@code m1} and returns the sum of
     * the non-zero counts reported by the tasks.
     *
     * <p>Rows are split into ranges of {@code max(rows / threads, 1)} rows. The lower bound of one row
     * per range keeps the loop advancing when there are more threads than rows. The pool is always shut
     * down, and an interruption restores the thread's interrupt flag before being wrapped.
     *
     * @throws DMLRuntimeException wrapping an interruption or a task failure
     */
    private static long binaryMMMultiThread(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) {
        final int nRows = m1.getNumRows();
        final int k = Math.max(op.getNumThreads(), 1);
        // Never 0: a zero step would loop forever on `i += blkz`.
        final int blkz = Math.max(ret.getNumRows() / k, 1);
        long nnz = 0L;
        final ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
        try {
            final List<BinaryMMTask> tasks = new ArrayList<BinaryMMTask>();
            for (int i = 0; i < nRows; i += blkz) {
                tasks.add(new BinaryMMTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op, left));
            }
            for (Future<Long> f : pool.invokeAll(tasks)) {
                nnz += f.get().longValue();
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (ExecutionException e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // Release the pool on success and on every failure path.
            pool.shutdown();
        }
        return nnz;
    }

    /**
     * Optionally rewrites the column groups of {@code m} before a cell-wise kernel runs.
     *
     * <p>When {@code CLALibUtils.shouldPreFilter} returns {@code false}, {@code m} itself is returned.
     * Otherwise a new block is created from {@code m} whose group list is the output of
     * {@code CLALibUtils.filterGroups} plus one constant group created from the {@code constV} array that
     * was handed to the filter.
     */
    private static CompressedMatrixBlock morph(CompressedMatrixBlock m) {
        final List<AColGroup> groups = m.getColGroups();
        if (!CLALibUtils.shouldPreFilter(groups)) {
            return m;
        }
        final CompressedMatrixBlock morphed = new CompressedMatrixBlock(m);
        final double[] constV = new double[m.getNumColumns()];
        final List<AColGroup> filtered = CLALibUtils.filterGroups(groups, constV);
        filtered.add(ColGroupConst.create(constV));
        morphed.allocateColGroupList(filtered);
        return morphed;
    }

    /**
     * Decompresses rows {@code [rl, ru)} of every group into {@code db}.
     *
     * <p>Groups whose compression type is {@code SDC} are cast to {@link ASDCZero} and decompressed with
     * the iterator stored at the same index in {@code its}; all other groups use the iterator-free call.
     *
     * @param its per-group iterators, indexed like {@code groups}
     */
    protected static void decompressToSubBlock(int rl, int ru, DenseBlock db, List<AColGroup> groups, AIterator[] its) {
        final int nGroups = groups.size();
        for (int idx = 0; idx < nGroups; ++idx) {
            final AColGroup group = groups.get(idx);
            if (group.getCompType() != AColGroup.CompressionType.SDC) {
                group.decompressToDenseBlock(db, rl, ru, 0, 0);
            } else {
                ((ASDCZero)group).decompressToDenseBlock(db, rl, ru, 0, 0, its[idx]);
            }
        }
    }

    /**
     * Builds the per-group iterator array used by {@code decompressToSubBlock}.
     *
     * <p>The array has one slot per group. Slots of groups with compression type {@code SDC} hold the
     * iterator returned by {@code getIterator(rl)} on the group cast to {@link ASDCZero}; all other slots
     * stay {@code null}.
     */
    protected static AIterator[] getIterators(List<AColGroup> groups, int rl) {
        final int nGroups = groups.size();
        final AIterator[] iterators = new AIterator[nGroups];
        for (int idx = 0; idx < nGroups; ++idx) {
            final AColGroup group = groups.get(idx);
            if (group.getCompType() == AColGroup.CompressionType.SDC) {
                iterators[idx] = ((ASDCZero)group).getIterator(rl);
            }
        }
        return iterators;
    }

    /** Row-operation task that applies the vector as the right operand of its column group. */
    private static class BinaryMVRowTaskRight
    extends BinaryMVRowTask {
        protected BinaryMVRowTaskRight(AColGroup group, double[] v, BinaryOperator op, boolean isRowSafe) {
            super(group, v, op, isRowSafe);
        }

        /** @return the column group produced by {@code binaryRowOpRight} */
        @Override
        public AColGroup call() {
            final AColGroup updated = _group.binaryRowOpRight(_op, _v, _isRowSafe);
            return updated;
        }
    }

    /** Row-operation task that applies the vector as the left operand of its column group. */
    private static class BinaryMVRowTaskLeft
    extends BinaryMVRowTask {
        protected BinaryMVRowTaskLeft(AColGroup group, double[] v, BinaryOperator op, boolean isRowSafe) {
            super(group, v, op, isRowSafe);
        }

        /** @return the column group produced by {@code binaryRowOpLeft} */
        @Override
        public AColGroup call() {
            final AColGroup updated = _group.binaryRowOpLeft(_op, _v, _isRowSafe);
            return updated;
        }
    }

    /**
     * Base class of the per-column-group row-operation tasks: holds the group, the row vector, the
     * operator and the row-safety flag. Subclasses supply {@code call()}.
     */
    private static abstract class BinaryMVRowTask
    implements Callable<AColGroup> {
        /** Column group the task operates on. */
        protected final AColGroup _group;
        /** Row vector operand. */
        protected final double[] _v;
        /** Operator to apply. */
        protected final BinaryOperator _op;
        /** Row-safety flag forwarded to the column group call. */
        protected final boolean _isRowSafe;

        protected BinaryMVRowTask(AColGroup group, double[] v, BinaryOperator op, boolean isRowSafe) {
            _group = group;
            _v = v;
            _op = op;
            _isRowSafe = isRowSafe;
        }
    }

    /**
     * Task computing {@code m2[row] op m1[row, col]} for rows {@code [rl, ru)}, where one value of
     * {@code m2} is read per row. The rows of {@code m1} are first decompressed into {@code ret} and then
     * overwritten in place with the results.
     *
     * <p>{@code call()} returns the number of non-zero results written for this row range, so summing the
     * values of several tasks gives the non-zero count of the covered rows.
     */
    private static class BinaryMVColLeftTask
    implements Callable<Integer> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final BinaryOperator _op;

        protected BinaryMVColLeftTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, BinaryOperator op) {
            this._m1 = m1;
            this._m2 = m2;
            this._ret = ret;
            this._op = op;
            this._rl = rl;
            this._ru = ru;
        }

        /**
         * @return the count of non-zero cells written in rows {@code [rl, ru)}
         * @throws NotImplementedException if {@code m2} is in sparse format
         */
        @Override
        public Integer call() {
            // Reject the unsupported format before spending time on decompression.
            if (this._m2.isInSparseFormat()) {
                throw new NotImplementedException("Not Implemented sparse Format execution for MM.");
            }
            for (AColGroup g : this._m1.getColGroups()) {
                g.decompressToDenseBlock(this._ret.getDenseBlock(), this._rl, this._ru);
            }
            final int nCol = this._m1.getNumColumns();
            final double[] retDense = this._ret.getDenseBlockValues();
            final double[] m2Dense = this._m2.getDenseBlockValues();
            int offset = this._rl * nCol;
            int nnz = 0;
            for (int row = this._rl; row < this._ru; ++row) {
                final double vr = m2Dense[row];
                for (int col = 0; col < nCol; ++col) {
                    final double v = this._op.fn.execute(vr, retDense[offset]);
                    retDense[offset] = v;
                    if (v != 0.0) {
                        ++nnz;
                    }
                    ++offset;
                }
            }
            return nnz;
        }
    }

    /**
     * Task applying a cell-wise binary operator between rows {@code [rl, ru)} of a compressed matrix and
     * the same rows of {@code m2}. Rows are handled in sub-blocks: each sub-block of {@code m1} is
     * decompressed into {@code ret} and then overwritten in place with the operator result.
     *
     * <p>{@code left} selects the operand order: {@code m2 op m1} when {@code true}, {@code m1 op m2}
     * otherwise. {@code call()} returns the non-zero count of the rows it processed.
     */
    private static class BinaryMMTask
    implements Callable<Long> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final boolean _left;
        private final BinaryOperator _op;

        protected BinaryMMTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, BinaryOperator op, boolean left) {
            this._m1 = m1;
            this._m2 = m2;
            this._ret = ret;
            this._op = op;
            this._rl = rl;
            this._ru = ru;
            this._left = left;
        }

        /** @return the sum of the non-zero counts of all processed sub-blocks */
        @Override
        public Long call() {
            final List<AColGroup> groups = this._m1.getColGroups();
            // Sub-block height: shrinks with more columns and more groups, but never below 64 rows.
            final int blklen = Math.max(16384 / this._ret.getNumColumns() / groups.size(), 64);
            final AIterator[] its = CLALibBinaryCellOp.getIterators(groups, this._rl);
            long nnz = 0L;
            for (int r = this._rl; r < this._ru; r += blklen) {
                final int re = Math.min(r + blklen, this._ru);
                this.processBlock(r, re, groups, its);
                nnz += this._ret.recomputeNonZeros(r, re - 1);
            }
            return nnz;
        }

        /** Decompresses rows {@code [rl, ru)} into the result and applies the operator to them. */
        private final void processBlock(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            final DenseBlock db = this._ret.getDenseBlock();
            CLALibBinaryCellOp.decompressToSubBlock(rl, ru, db, groups, its);
            if (this._left) {
                this.processLeft(rl, ru);
            } else {
                this.processRight(rl, ru);
            }
        }

        /** Applies {@code m2 op ret}, choosing the kernel by the storage state of {@code m2}. */
        private final void processLeft(int rl, int ru) {
            if (this._m2.isEmpty()) {
                // Same treatment as the right-hand side: do not touch the storage of an empty block.
                this.processLeftEmpty(rl, ru);
            } else if (this._m2.isInSparseFormat()) {
                this.processLeftSparse(rl, ru);
            } else {
                this.processLeftDense(rl, ru);
            }
        }

        /** {@code m2 op ret} for a sparse {@code m2}; cells absent from a sparse row count as 0. */
        private final void processLeftSparse(int rl, int ru) {
            final DenseBlock rv = this._ret.getDenseBlock();
            final int cols = this._ret.getNumColumns();
            final SparseBlock m2sb = this._m2.getSparseBlock();
            for (int r = rl; r < ru; ++r) {
                final double[] retV = rv.values(r);
                int off = rv.pos(r);
                if (m2sb.isEmpty(r)) {
                    // m2 is the LEFT operand here, so the implicit zero goes first.
                    for (int c = off; c < cols + off; ++c) {
                        retV[c] = this._op.fn.execute(0.0, retV[c]);
                    }
                    continue;
                }
                final int apos = m2sb.pos(r);
                final int aend = m2sb.size(r) + apos;
                final int[] aix = m2sb.indexes(r);
                final double[] avals = m2sb.values(r);
                int k = apos;
                // One pass over all columns; `off` advances with every column, including those
                // after the last stored entry of the sparse row.
                for (int j = 0; j < cols; ++j, ++off) {
                    final double v = (k < aend && aix[k] == j) ? avals[k++] : 0.0;
                    retV[off] = this._op.fn.execute(v, retV[off]);
                }
            }
        }

        /** {@code m2 op ret} for a dense {@code m2}; both blocks are indexed with the result's row offset. */
        private final void processLeftDense(int rl, int ru) {
            final DenseBlock rv = this._ret.getDenseBlock();
            final int cols = this._ret.getNumColumns();
            final DenseBlock m2db = this._m2.getDenseBlock();
            for (int r = rl; r < ru; ++r) {
                final double[] retV = rv.values(r);
                final double[] m2V = m2db.values(r);
                final int off = rv.pos(r);
                for (int c = off; c < cols + off; ++c) {
                    retV[c] = this._op.fn.execute(m2V[c], retV[c]);
                }
            }
        }

        /** {@code 0 op ret} for every cell; used when {@code m2} is empty. */
        private final void processLeftEmpty(int rl, int ru) {
            final DenseBlock rv = this._ret.getDenseBlock();
            final int cols = this._ret.getNumColumns();
            for (int r = rl; r < ru; ++r) {
                final double[] retV = rv.values(r);
                final int off = rv.pos(r);
                for (int c = off; c < cols + off; ++c) {
                    retV[c] = this._op.fn.execute(0.0, retV[c]);
                }
            }
        }

        /** Applies {@code ret op m2}, choosing the kernel by the storage state of {@code m2}. */
        private final void processRight(int rl, int ru) {
            if (this._m2.isEmpty()) {
                this.processRightEmpty(rl, ru);
            } else if (this._m2.isInSparseFormat()) {
                this.processRightSparse(rl, ru);
            } else {
                this.processRightDense(rl, ru);
            }
        }

        /** {@code ret op m2} for a sparse {@code m2}; cells absent from a sparse row count as 0. */
        private final void processRightSparse(int rl, int ru) {
            final DenseBlock rv = this._ret.getDenseBlock();
            final int cols = this._ret.getNumColumns();
            final SparseBlock m2sb = this._m2.getSparseBlock();
            for (int r = rl; r < ru; ++r) {
                final double[] retV = rv.values(r);
                int off = rv.pos(r);
                if (m2sb.isEmpty(r)) {
                    for (int c = off; c < cols + off; ++c) {
                        retV[c] = this._op.fn.execute(retV[c], 0.0);
                    }
                    continue;
                }
                final int apos = m2sb.pos(r);
                final int aend = m2sb.size(r) + apos;
                final int[] aix = m2sb.indexes(r);
                final double[] avals = m2sb.values(r);
                int k = apos;
                // One pass over all columns; `off` advances with every column, including those
                // after the last stored entry of the sparse row.
                for (int j = 0; j < cols; ++j, ++off) {
                    final double v = (k < aend && aix[k] == j) ? avals[k++] : 0.0;
                    retV[off] = this._op.fn.execute(retV[off], v);
                }
            }
        }

        /** {@code ret op m2} for a dense {@code m2}; both blocks are indexed with the result's row offset. */
        private final void processRightDense(int rl, int ru) {
            final DenseBlock rv = this._ret.getDenseBlock();
            final int cols = this._ret.getNumColumns();
            final DenseBlock m2db = this._m2.getDenseBlock();
            for (int r = rl; r < ru; ++r) {
                final double[] retV = rv.values(r);
                final double[] m2V = m2db.values(r);
                final int off = rv.pos(r);
                for (int c = off; c < cols + off; ++c) {
                    retV[c] = this._op.fn.execute(retV[c], m2V[c]);
                }
            }
        }

        /** {@code ret op 0} for every cell; used when {@code m2} is empty. */
        private final void processRightEmpty(int rl, int ru) {
            final DenseBlock rv = this._ret.getDenseBlock();
            final int cols = this._ret.getNumColumns();
            for (int r = rl; r < ru; ++r) {
                final double[] retV = rv.values(r);
                final int off = rv.pos(r);
                for (int c = off; c < cols + off; ++c) {
                    retV[c] = this._op.fn.execute(retV[c], 0.0);
                }
            }
        }
    }

    /**
     * Task computing {@code m1[row, col] op m2[row]} for rows {@code [rl, ru)}, where one value of
     * {@code m2} is read per row. Rows are handled in sub-blocks: each sub-block of {@code m1} is
     * decompressed into {@code ret} and then overwritten in place with the results.
     *
     * <p>{@code call()} returns the number of non-zero results written for this row range, so summing the
     * values of several tasks gives the non-zero count of the covered rows.
     */
    private static class BinaryMVColTask
    implements Callable<Integer> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final BinaryOperator _op;

        protected BinaryMVColTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, BinaryOperator op) {
            this._m1 = m1;
            this._m2 = m2;
            this._ret = ret;
            this._op = op;
            this._rl = rl;
            this._ru = ru;
        }

        /**
         * @return the count of non-zero cells written in rows {@code [rl, ru)}
         * @throws NotImplementedException if {@code m2} is in sparse format (and the range is non-empty)
         */
        @Override
        public Integer call() {
            // Sub-block height: shrinks with more columns, but never below 64 rows.
            final int blklen = Math.max(16384 / this._ret.getNumColumns(), 64);
            final List<AColGroup> groups = this._m1.getColGroups();
            final AIterator[] its = CLALibBinaryCellOp.getIterators(groups, this._rl);
            int nnz = 0;
            for (int r = this._rl; r < this._ru; r += blklen) {
                nnz += this.processBlock(r, Math.min(r + blklen, this._ru), groups, its);
            }
            return nnz;
        }

        /** Decompresses rows {@code [rl, ru)} and applies the operator; returns the non-zeros written. */
        private final int processBlock(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            // Reject the unsupported format before spending time on decompression.
            if (this._m2.isInSparseFormat()) {
                throw new NotImplementedException("Not Implemented sparse Format execution for MM.");
            }
            final DenseBlock db = this._ret.getDenseBlock();
            CLALibBinaryCellOp.decompressToSubBlock(rl, ru, db, groups, its);
            return this.processDense(rl, ru);
        }

        /** Applies {@code ret op m2[row]} to rows {@code [rl, ru)}; returns the non-zeros written. */
        private final int processDense(int rl, int ru) {
            final int nCol = this._m1.getNumColumns();
            final double[] retDense = this._ret.getDenseBlockValues();
            final double[] m2Dense = this._m2.getDenseBlockValues();
            int offset = rl * nCol;
            int nnz = 0;
            for (int row = rl; row < ru; ++row) {
                final double vr = m2Dense[row];
                for (int col = 0; col < nCol; ++col) {
                    final double v = this._op.fn.execute(retDense[offset], vr);
                    retDense[offset] = v;
                    if (v != 0.0) {
                        ++nnz;
                    }
                    ++offset;
                }
            }
            return nnz;
        }
    }
}

