From cf17553cf232a19ea1b520c4f26a9f35c4390c7c Mon Sep 17 00:00:00 2001 From: Vivek Date: Tue, 28 Jul 2020 08:27:02 -0400 Subject: [PATCH] Affine Loop Interchange Pass --- mlir/include/mlir/Dialect/Affine/Passes.h | 3 + mlir/include/mlir/Dialect/Affine/Passes.td | 9 + .../Transforms/AffineLoopInterchange.cpp | 922 ++++++++++++++++++ .../Dialect/Affine/Transforms/CMakeLists.txt | 1 + 4 files changed, 935 insertions(+) create mode 100644 mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index 18b3b790338d8..fce1975cc9f97 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -36,6 +36,9 @@ std::unique_ptr> createSimplifyAffineStructuresPass(); std::unique_ptr> createAffineLoopInvariantCodeMotionPass(); +/// Creates a loop interchange pass +std::unique_ptr> createAffineLoopInterchangePass(); + /// Creates a pass to convert all parallel affine.for's into 1-d affine.parallel /// ops. std::unique_ptr> createAffineParallelizePass(); diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index 810640058155f..18f222abe8375 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -48,6 +48,15 @@ def AffineLoopInvariantCodeMotion let constructor = "mlir::createAffineLoopInvariantCodeMotionPass()"; } + +/* AffineLoopInterchangePassBase*/ +def AffineLoopInterchange + : FunctionPass<"affine-loop-interchange"> { + let summary = "Interchanges the loops to the optimal ordering possible"; + let constructor = "mlir::createAffineLoopInterchangePass()"; +} + + def AffineLoopTiling : FunctionPass<"affine-loop-tile"> { let summary = "Tile affine loop nests"; let constructor = "mlir::createLoopTilingPass()"; diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp new file mode 100644 index 0000000000000..a0b1efa78211b --- /dev/null +++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp @@ -0,0 +1,922 @@ +//####This pass does the loop interchange#####// +// This code makes the assumption that if the input program have a loop nest +// then that loop nest will contain atleast one array load or store operation + +#include "PassDetail.h" +#include "algorithm" +#include "bits/stdc++.h" +#include "iostream" +#include "mlir/Analysis/AffineAnalysis.h" +#include "mlir/Analysis/AffineStructures.h" +#include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Analysis/Utils.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Utils.h" +#include "typeinfo" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace mlir; + +#define DEBUG_TYPE "affine-loop-interchange" + +namespace { + +class ValuePositionMap { +public: + void addSrcValue(Value value) { + if (addValueAt(value, &srcDimPosMap, numSrcDims)) + ++numSrcDims; + } + void addSymbolValue(Value value) { + if (addValueAt(value, &symbolPosMap, numSymbols)) + ++numSymbols; + } + unsigned getSrcDimOrSymPos(Value value) const { + return getDimOrSymPos(value, srcDimPosMap, 0); + } + unsigned getSymPos(Value value) const { + auto it = symbolPosMap.find(value); + assert(it != symbolPosMap.end()); + return numSrcDims + it->second; + } + + unsigned getNumSrcDims() const { return numSrcDims; } + unsigned getNumDims() const { return numSrcDims; } + unsigned getNumSymbols() const { return numSymbols; } + +private: + bool addValueAt(Value value, DenseMap *posMap, + unsigned position) { + auto it = posMap->find(value); + if (it == posMap->end()) { + (*posMap)[value] = position; + return true; + } + return false; + } + unsigned getDimOrSymPos(Value value, + const DenseMap &dimPosMap, + unsigned dimPosOffset) const { + auto it = dimPosMap.find(value); + if (it != dimPosMap.end()) { + return dimPosOffset + it->second; + } + it = symbolPosMap.find(value); + assert(it != symbolPosMap.end()); + return numSrcDims + it->second; + } + + unsigned numSrcDims = 0; + unsigned numSymbols = 0; + DenseMap srcDimPosMap; + DenseMap symbolPosMap; +}; + +// Loop Interchange Pass +struct LoopInterchange : public AffineLoopInterchangeBase { + SmallVector loadsAndStores; + // A vector for storing operation type i.e. load or store + std::vector oper_type; + // A 3d vector used for storing access matrix + std::vector>> accessmat; + // A 2d vector for storing dependences + std::vector> dependencemat; + // This vector contains the rank of all access matrices of Operations + // in the order of their apperance in the program + std::vector rank_accmat; + // This variable contains the rank of dependence Matrix + int rank_depmat; + // Variables for access AccessMatrix + unsigned page, rows, col; + // This vector contains all valid loop permutations + std::vector> validperms; + // Variable to tell total no. of valid permutations + int num_vp; + // this vector contains the indexes of loops which can be parallelized + std::vector par_loops; + // Vector to contain groups formed + std::vector> final_groups; + // contains index of operations with spatial locality + std::set sl_ind; + // contains index of operations with temporal locality + std::set tl_ind; + // variable for maintaing dependencemat index + int dm_index; + + void runOnFunction() override; + void find_dep_access(AffineForOp floop); + void createAccessMatrix(ArrayRef loadsAndStores); + void checkDependences(ArrayRef loadsAndStores); + std::string + getDirectionVectorStr(bool ret, unsigned numCommonLoops, + unsigned loopNestDepth, + ArrayRef dependenceComponents); + void valid_perm(); + void checkvalidperm(int ar[], int loop_dep); + int best_perm(); + std::vector cost_model(std::vector cperm); + int spatial_score(std::vector perm_ss); + int temporal_score(std::vector perm_ts); + int parallelism_score(std::vector perm_ps); + void find_parallel_loops(); + int group_reuse(); + bool compare_accmat(int i, int j); + void form_group(); +}; +} // end of namespace + +// checks if t1 is a proper subset(equal as well) of t2 or not +bool isSubset(std::set t1, std::set t2) { + if (t1.size() >= t2.size()) + return false; + bool flag = true; + std::set::iterator it1; + for (it1 = t1.begin(); it1 != t1.end(); it1++) { + if (t2.find(*it1) == t2.end()) { + flag = false; + break; + } + } + return flag; +} + +bool LoopInterchange::compare_accmat(int i, int j) { + int size_i = (accessmat[i].size()) * (accessmat[i][0].size()); + int size_j = (accessmat[j].size()) * (accessmat[j][0].size()); + if (size_i != size_j) + return false; + bool flag = true; + for (int p = 0, e = accessmat[i].size(); p < e; p++) { + for (int q = 0, k = accessmat[i][p].size(); q < k; q++) { + if (accessmat[i][p][q] != accessmat[j][p][q]) { + flag = false; + break; + } + } + if (!flag) + break; + } + return flag; +} + +// This function forms the groups for a loop nest in the input program +void LoopInterchange::form_group() { + final_groups.clear(); + std::vector> groups; + for (int i = 0, e = accessmat.size(); i < e; i++) { + for (int j = 0, f = e; j < f; j++) { + bool eq_accmat = compare_accmat(i, j); + auto *srcOpInst = loadsAndStores[i]; + MemRefAccess srcAccess(srcOpInst); + auto *dstOpInst = loadsAndStores[j]; + MemRefAccess dstAccess(dstOpInst); + unsigned numCommonLoops = + getNumCommonSurroundingLoops(*srcOpInst, *dstOpInst); + bool dep_flag = false; + for (unsigned d = 1; d <= numCommonLoops + 1; ++d) { + FlatAffineConstraints dependenceConstraints; + SmallVector dependenceComponents; + DependenceResult result = checkMemrefAccessDependence( + srcAccess, dstAccess, d, &dependenceConstraints, + &dependenceComponents, true); + if (hasDependence(result)) { + dep_flag = true; + break; + } + } + if (eq_accmat && dep_flag) { // true if ith access matrix is equal to jth + // if there's a dependence b/w i & j + bool in_flag = false; + for (int p = 0, g = groups.size(); p < g; p++) { + if (groups[p].find(i) != groups[p].end() || + groups[p].find(j) != groups[p].end()) { + groups[p].insert(i); + groups[p].insert(j); + in_flag = true; + break; + } + } + if (!in_flag) { + std::set t; + t.insert(i); + t.insert(j); + groups.push_back(t); + } + } else { + bool out_f1 = false; + bool out_f2 = false; + for (int p = 0, g = groups.size(); p < g; p++) { + if (groups[p].find(i) != groups[p].end()) { + out_f1 = true; + } + if (groups[p].find(j) != groups[p].end()) { + out_f2 = true; + } + } + if (!out_f1) { + std::set t1; + t1.insert(i); + groups.push_back(t1); + } + + if (!out_f2) { + std::set t2; + t2.insert(j); + groups.push_back(t2); + } + } + } + } + // eliminating redundancies + for (int i = 0, e = groups.size(); i < e; i++) { + bool sub = false; + for (int j = 0, f = groups.size(); j < f; j++) { + if (isSubset(groups[i], groups[j])) { + sub = true; + break; + } + } + if (!sub) + final_groups.push_back(groups[i]); + } +} + +// This function finds out the loops that can be parallelized and stores them +// vector par_loops +void LoopInterchange::find_parallel_loops() { + par_loops.clear(); + std::set seq_loops; + int n_loops = (int)accessmat[0][0].size(); + for (int i = 0, e = dependencemat.size(); i < e; i++) { + for (int j = 0, f = dependencemat[i].size(); j < f; j++) { + if (dependencemat[i][j] > 0) { + seq_loops.insert(j); + break; + } + } + } + for (int i = 0; i < n_loops; i++) { + if (seq_loops.find(i) == seq_loops.end()) { + par_loops.push_back(i); + } + } +} + +// This function computes the group reuse scores +int LoopInterchange::group_reuse() { + int ts = 0; + for (int i = 0, e = final_groups.size(); i < e; i++) { + if (final_groups[i].size() > 1) { + // enter here only if the group size is greater than one + // take a representative element from a group let's say 1st element always + auto it = final_groups[i].begin(); + int rep_ele = *it; + int temp_sc = 0; + int spat_sc = 0; + if (tl_ind.find(rep_ele) != tl_ind.end()) + temp_sc = final_groups[i].size(); + if (sl_ind.find(rep_ele) != sl_ind.end()) + spat_sc = final_groups[i].size(); + ts += temp_sc + spat_sc; + } + } + return ts; +} + +// This function computes the sync-free parallelism score for the given +// permutaion +int LoopInterchange::parallelism_score(std::vector perm_ps) { + // total score for all accesses + int ts = 0; + // Check if there are loops that can be parallelized + if (par_loops.size() > 0) { + for (int i = 0, e = (int)perm_ps.size(); i < e; i++) { + if (std::find(par_loops.begin(), par_loops.end(), perm_ps[i]) != + par_loops.end()) { + // true if loop at index i in perm_ps is parallel + ts = e - i; + } else { + if (ts == 0) + continue; + break; + } + } + } + return ts; +} + +// This function computes the temporal locality score for the given permutaion +int LoopInterchange::temporal_score(std::vector perm_ts) { + tl_ind.clear(); + // total score for all accesses + int ts = 0; + int cols = perm_ts.size(); + for (int i = 0, e = rank_accmat.size(); i < e; i++) { + int max_tempLoc = cols - rank_accmat[i]; + for (int j = 1; j <= max_tempLoc; j++) { + bool flag = true; + for (int k = 0, f = accessmat[i].size(); k < f; k++) { + if (accessmat[i][k][perm_ts[cols - j]] != 0) { + flag = false; + break; + } + } + if (flag) { + tl_ind.insert(i); + ts += 1; + } + } + } + return ts; +} + +// This function computes the spatial locality score for the given permutaion +int LoopInterchange::spatial_score(std::vector perm_ss) { + sl_ind.clear(); + // total score for all accesses + int ts = 0; + // compute the spatial_score for each access matrix + for (int i = 0, e = accessmat.size(); i < e; i++) { + // fvd is fastest varying dimension + int size = perm_ss.size() - 1; + int fvd = perm_ss[size]; + int index = accessmat[i].size(); + do { + for (int j = 0, f = accessmat[i].size(); j < f; j++) { + if (accessmat[i][j][fvd] > 0) { + index = j; + break; + } + } + if (index == (int)accessmat[i].size()) { // it means entire column is 0 + size -= 1; + fvd = perm_ss[size]; + continue; + } else { // it means a non-zero element in the fvd column + for (int r = size; r < (int)perm_ss.size(); r++) { + if (index == ((int)accessmat[i].size()) - 1) { + sl_ind.insert(i); + ts += 1; + break; + } + } + break; + } + } while (true); + } + return ts; +} + +// This function return the cost i.e. score for a valid permutation +std::vector LoopInterchange::cost_model(std::vector cperm) { + std::vector score; + score.push_back(spatial_score(cperm)); + score.push_back(temporal_score(cperm)); + score.push_back(parallelism_score(cperm)); + score.push_back(group_reuse()); + return score; +} + +// This function finds the best loop permutation out of all valid permutations +// and returns the index of best permutation +int LoopInterchange::best_perm() { + // This function finds the loops that can be parallelized i.e. loops that + // don't carry dependences and stores them into vector par_loops + find_parallel_loops(); + + // this function will form the groups and store them in vector final_groups + form_group(); + + // computing the cost for each valid permutation and storing them in the + // 2d vector cost in the order of spatial,temporal and parallelism score resp. + std::vector> cost; + for (int i = 0; i < num_vp; i++) { + std::vector bperm; + for (int j = 0, c = validperms[i].size(); j < c; j++) { + bperm.push_back(validperms[i][j]); + } + cost.push_back(cost_model(bperm)); + } + + int tempScore, bestScore = -100, bestIndex = 0; + // finding the index of best loop permutation + for (int i = 0, e = cost.size(); i < e; i++) { + tempScore = cost[i][0] + cost[i][1] + cost[i][2]; + if (tempScore > bestScore) { + bestScore = tempScore; + bestIndex = i; + } else if (tempScore == bestScore) { + if (cost[i][0] + cost[i][1] > cost[bestIndex][0] + cost[bestIndex][1]) { + // breaking the clash based on locality + bestIndex = i; + } else if (cost[i][0] + cost[i][1] == + cost[bestIndex][0] + cost[bestIndex][1]) { + // breaking the clash based on group reuse + if (cost[i][3] > cost[bestIndex][3]) + bestIndex = i; + } + } + } + return bestIndex; +} + +// This function checks whether the given permutation is valid or not +void LoopInterchange::checkvalidperm(int ar[], int loop_dep) { + bool flag = true; + // check if there's atleast 1 dependence + if (dependencemat.size() > 0) { + for (int i = 0, e = dependencemat.size(); i < e; i++) { + int sum = 0; + for (int j = 0; j < loop_dep; j++) { + sum += dependencemat[i][ar[j]]; + if (sum != 0) { + if (sum < 0) { + flag = false; + } + break; + } + } + if (!flag) + break; + } + } + if (flag) { + std::vector vp2; + validperms.push_back(vp2); + for (int k = 0; k < loop_dep; k++) + validperms[num_vp].push_back(ar[k]); + num_vp++; + } +} + +// This function generates all valid permutations of loop nest +void LoopInterchange::valid_perm() { + num_vp = 0; + validperms.clear(); + int num_loop = accessmat[0][0].size(); + int *loopiv{new int[num_loop]{}}; + for (int i = 0; i < num_loop; i++) { + loopiv[i] = i; + } + do { + checkvalidperm(loopiv, num_loop); + } while (std::next_permutation(loopiv, loopiv + num_loop)); +} + +// This function is called by checkDependences function +std::string LoopInterchange::getDirectionVectorStr( + bool ret, unsigned numCommonLoops, unsigned loopNestDepth, + ArrayRef dependenceComponents) { + if (!ret) + return "false"; + if (dependenceComponents.empty() || loopNestDepth > numCommonLoops) + return "true"; + std::vector d2; + dependencemat.push_back(d2); + std::string result; + for (unsigned i = 0, e = dependenceComponents.size(); i < e; ++i) { + int64_t lowerb = 0, temp = 0; + if (dependenceComponents[i].lb.hasValue() && + dependenceComponents[i].lb.getValue() != + std::numeric_limits::min()) + lowerb = dependenceComponents[i].lb.getValue(); + + int64_t upperb = 0; + if (dependenceComponents[i].ub.hasValue() && + dependenceComponents[i].ub.getValue() != + std::numeric_limits::max()) { + upperb = dependenceComponents[i].ub.getValue(); + } + + if (lowerb == upperb) { + temp = (lowerb); + } else if (lowerb < 0 && upperb < 0) { + temp = (upperb); + } else if (lowerb > 0 && upperb > 0) { + temp = (lowerb); + } else if (lowerb == 0 && upperb > 0) { + temp = (lowerb + 1); + } else if (upperb == 0 && lowerb < 0) { + temp = (upperb - 1); + } else if (lowerb < 0 && upperb > 0) { + temp = -1; + } + + dependencemat[dm_index].push_back(temp); + result += "[" + std::to_string(temp) + "]"; + } + return result; +} + +// This function check for dependences present inside the loop +void LoopInterchange::checkDependences(ArrayRef loadsAndStores) { + dm_index = -1; + for (unsigned i = 0, e = loadsAndStores.size(); i < e; ++i) { + auto *srcOpInst = loadsAndStores[i]; + MemRefAccess srcAccess(srcOpInst); + for (unsigned j = 0; j < e; ++j) { + auto *dstOpInst = loadsAndStores[j]; + MemRefAccess dstAccess(dstOpInst); + + unsigned numCommonLoops = + getNumCommonSurroundingLoops(*srcOpInst, *dstOpInst); + for (unsigned d = 1; d <= numCommonLoops; ++d) { + FlatAffineConstraints dependenceConstraints; + SmallVector dependenceComponents; + DependenceResult result = checkMemrefAccessDependence( + srcAccess, dstAccess, d, &dependenceConstraints, + &dependenceComponents); + + bool ret = hasDependence(result); + if (ret) { // This means dependence exists + dm_index++; + std::string dep_result = getDirectionVectorStr(ret, numCommonLoops, d, + dependenceComponents); + break; + } + } + } + } +} + +// The code from here uptill next 115 lines is taken from the AffineAnalysis.cpp file +static LogicalResult +addMemRefAccessConstraints(const AffineValueMap &srcAccessMap, + const ValuePositionMap &valuePosMap, + FlatAffineConstraints *dependenceDomain) { + AffineMap srcMap = srcAccessMap.getAffineMap(); + unsigned numResults = srcMap.getNumResults(); + + ArrayRef srcOperands = srcAccessMap.getOperands(); + + std::vector> srcFlatExprs; + FlatAffineConstraints srcLocalVarCst; + if (failed(getFlattenedAffineExprs(srcMap, &srcFlatExprs, &srcLocalVarCst))) + return failure(); + + // Equality to add. + SmallVector eq(dependenceDomain->getNumCols()); + for (unsigned i = 0; i < numResults; ++i) { + // Zero fill. + std::fill(eq.begin(), eq.end(), 0); + + // Flattened AffineExpr for src result 'i'. + const auto &srcFlatExpr = srcFlatExprs[i]; + // Set identifier coefficients from src access function. + for (unsigned j = 0, e = srcOperands.size(); j < e; ++j) + eq[valuePosMap.getSrcDimOrSymPos(srcOperands[j])] = srcFlatExpr[j]; + + // Add equality constraint. + dependenceDomain->addEquality(eq); + } + return success(); +} + +static void +buildDimAndSymbolPositionMaps(const FlatAffineConstraints &srcDomain, + const AffineValueMap &srcAccessMap, + ValuePositionMap *valuePosMap, + FlatAffineConstraints *dependenceConstraints) { + auto updateValuePosMap = [&](ArrayRef values, bool isSrc) { + for (unsigned i = 0, e = values.size(); i < e; ++i) { + auto value = values[i]; + if (isSrc) { + valuePosMap->addSrcValue(value); + } + } + }; + + SmallVector srcValues, destValues; + srcDomain.getIdValues(0, srcDomain.getNumDimIds(), &srcValues); + // Update value position map with identifiers from src iteration domain. + updateValuePosMap(srcValues, /*isSrc=*/true); + // Update value position map with identifiers from src access function. + updateValuePosMap(srcAccessMap.getOperands(), /*isSrc=*/true); +} + +// Sets up dependence constraints columns appropriately +static void +initDependenceConstraints(const FlatAffineConstraints &srcDomain, + const AffineValueMap &srcAccessMap, + const ValuePositionMap &valuePosMap, + FlatAffineConstraints *dependenceConstraints) { + // Calculate number of equalities/inequalities and columns required to + // initialize FlatAffineConstraints for 'dependenceDomain'. + unsigned numIneq = srcDomain.getNumInequalities(); + AffineMap srcMap = srcAccessMap.getAffineMap(); + unsigned numEq = srcMap.getNumResults(); + unsigned numDims = srcDomain.getNumDimIds(); + unsigned numSymbols = 0 /*valuePosMap.getNumSymbols()*/; + unsigned numLocals = 0 /*srcDomain.getNumLocalIds()*/; + unsigned numIds = numDims + numSymbols + numLocals; + unsigned numCols = numIds + 1; + + // Set flat affine constraints sizes and reserving space for constraints. + dependenceConstraints->reset(numIneq, numEq, numCols, numDims, numSymbols, + numLocals); + + // Set values corresponding to dependence constraint identifiers. + SmallVector srcLoopIVs; + srcDomain.getIdValues(0, srcDomain.getNumDimIds(), &srcLoopIVs); + + dependenceConstraints->setIdValues(0, srcLoopIVs.size(), srcLoopIVs); +} + +static LogicalResult getInstIndexSet(Operation *op, + FlatAffineConstraints *indexSet) { + SmallVector loops; + getLoopIVs(*op, &loops); + return getIndexSet(loops, indexSet); +} + +// function that computes access matrix for an operation +void checkMemrefAccess( + const MemRefAccess &srcAccess, unsigned loopDepth, + FlatAffineConstraints *dependenceConstraints, + SmallVector *dependenceComponents) { + + // Get composed access function for 'srcAccess'. + AffineValueMap srcAccessMap; + srcAccess.getAccessMap(&srcAccessMap); + + // Get iteration domain for the 'srcAccess' operation. + FlatAffineConstraints srcDomain; + getInstIndexSet(srcAccess.opInst, &srcDomain); + + ValuePositionMap valuePosMap; + buildDimAndSymbolPositionMaps(srcDomain, srcAccessMap, &valuePosMap, + dependenceConstraints); + + initDependenceConstraints(srcDomain, srcAccessMap, valuePosMap, + dependenceConstraints); + + // assert(valuePosMap.getNumDims() == srcDomain.getNumDimIds()); + + addMemRefAccessConstraints(srcAccessMap, valuePosMap, dependenceConstraints); +} + +// This function creates access matrix for each load and store Operation +void LoopInterchange::createAccessMatrix(ArrayRef loadsAndStores) { + for (unsigned i = 0, e = loadsAndStores.size(); i < e; ++i) { + auto *srcOpInst = loadsAndStores[i]; + MemRefAccess srcAccess(srcOpInst); + + FlatAffineConstraints accessConstraints; + SmallVector accessComponents; + checkMemrefAccess(srcAccess, 1, &accessConstraints, &accessComponents); + col = accessConstraints.getNumCols(); + rows = accessConstraints.getNumEqualities(); + col = (col - 1); + std::vector> l2; + accessmat.push_back(l2); + for (unsigned r = 0; r < rows; r++) { + std::vector l3; + accessmat[i].push_back(l3); + for (unsigned c = 0; c < col; c++) { + accessmat[i][r].push_back(accessConstraints.atEq(r, c)); + } + } + } +} + +// Function to find the rank of access matrix +int compute_rank(std::vector> A) { + int n = A.size(); + int m = A[0].size(); + const double EPS = 1E-9; + int rank = 0; + std::vector row_selected(n, false); + for (int i = 0; i < m; ++i) { + int j; + for (j = 0; j < n; ++j) { + if (!row_selected[j] && std::abs(A[j][i]) > EPS) + break; + } + + if (j != n) { + ++rank; + row_selected[j] = true; + for (int p = i + 1; p < m; ++p) + A[j][p] /= A[j][i]; + for (int k = 0; k < n; ++k) { + if (k != j && std::abs(A[k][i]) > EPS) { + for (int p = i + 1; p < m; ++p) + A[k][p] -= A[j][p] * A[k][i]; + } + } + } + } + return rank; +} + +// This function will find out all the dependences, access matrices and ranks +// of operations present inside the loop floop +void LoopInterchange::find_dep_access(AffineForOp floop) { + loadsAndStores.clear(); + oper_type.clear(); + accessmat.clear(); + dependencemat.clear(); + rank_accmat.clear(); + + // Finding out all affine loadsAndStores operations + floop.walk([&](Operation *op) { + if (isa(op)) { + loadsAndStores.push_back(op); + oper_type.push_back('L'); + } else if (isa(op)) { + loadsAndStores.push_back(op); + oper_type.push_back('S'); + } + }); + + // This function will find out all access matrices and store them in accessmat + createAccessMatrix(loadsAndStores); + + // Finding out the rank of each access AccessMatrix and storing in rank_accmat + for (size_t i = 0, e = accessmat.size(); i < e; i++) { + std::vector> am; + for (size_t j = 0; j < accessmat[i].size(); j++) { + std::vector am2; + am.push_back(am2); + for (size_t k = 0; k < accessmat[i][j].size(); k++) { + am[j].push_back(accessmat[i][j][k]); + } + } + rank_accmat.push_back(compute_rank(am)); + } + + // This function will find out all the dependences in a loop nest + // and store them in dependencemat + checkDependences(loadsAndStores); + if (dependencemat.size() > 0) { + // Converting the dependencemat from int to double for rank calculation + std::vector> dm; + for (size_t i = 0, e = dependencemat.size(); i < e; i++) { + std::vector dm2; + dm.push_back(dm2); + for (size_t j = 0, f = dependencemat[i].size(); j < f; j++) { + dm[i].push_back(dependencemat[i][j]); + } + } + rank_depmat = compute_rank(dm); + } +} // end of the find_dep_access function + +// This function permutes the given loop nest for the given best permutation +void bestLoopPermute(std::vector int_per, AffineForOp forloop) { + SmallVector allforloops; + + getPerfectlyNestedLoops(allforloops, forloop); + + // conveting permutation from int to unsigned int + std::vector permut; + + for (int i = 0, e = int_per.size(); i < e; i++) { + std::vector::iterator it = + std::find(int_per.begin(), int_per.end(), i); + permut.push_back(std::distance(int_per.begin(), it)); + } + + unsigned outerIndex = permuteLoops(allforloops, permut); + outerIndex += 1; +} + +// This function checks whether the given loop nest is rectangular or not +// returns true if loop nest is rectangular +bool checkRect(AffineForOp floop) { + bool flag = true; + floop.walk([&](AffineForOp T) { + if (!(T.hasConstantBounds())) + flag = false; + }); + return flag; +} + +// This function makes the given imperfect loop nest a perfect loop nest +// works for imperfect loop nests assuming only 2 child loops in parallel +void makeperfect_nest(AffineForOp floop) { + // creating a for loop using opbuilder same as floop + OpBuilder opb(floop.getOperation()->getBlock(), + std::next(Block::iterator(floop.getOperation()))); + AffineForOp clone_op = static_cast(opb.clone(*floop)); + + // floop_child is the immediate child of floop and same is clone_op_child + AffineForOp floop_child, clone_op_child; + int iter = 0; + floop.walk([&](AffineForOp temp) { + Operation *par_op = temp.getParentOp(); + if (isa(*par_op) && + static_cast(par_op) == floop) { + floop_child = temp; + iter++; + } + }); + + floop_child.erase(); + + int iter2 = 0; + clone_op.walk([&](AffineForOp temp) { + Operation *par_op = temp.getParentOp(); + if (isa(*par_op) && + static_cast(par_op) == clone_op) { + iter2++; + if (iter2 == iter - 1) + temp.erase(); + } + }); +} + +// This function checks that the given loop nest is perfectly nested or not +bool checkperfectnest(AffineForOp floop) { + bool flag = false; + SmallVector all_loops; + Block *loopbody = floop.getBody(); + loopbody->walk([&](AffineForOp temp) { + Operation *par_temp = temp.getParentOp(); + if (isa(*par_temp) && + static_cast(par_temp) == floop) + all_loops.push_back(temp); + }); + all_loops.push_back(floop); + std::reverse(all_loops.begin(), all_loops.end()); + if (isPerfectlyNested(all_loops)) + flag = true; + + return flag; +} + +// This function checks whether the loop nest contains if statement or not +bool checkif(AffineForOp floop) { + bool flag = false; + Block *loopbody = floop.getBody(); + loopbody->walk([&](AffineIfOp temp) { flag = true; }); + return flag; +} + +// check for all the imperfect loop nests present inside the function fun and +// makes them perfect loop nests +void check_make_perfect_loop(FuncOp fun) { + bool flag; + do { + flag = true; + fun.walk([&](AffineForOp temp) { + // true if the loop nest is not perfect + if (!checkif(temp) && checkRect(temp) && !checkperfectnest(temp)) { + makeperfect_nest(temp); + flag = false; + } + }); + } while (!flag); +} + +void LoopInterchange::runOnFunction() { + FuncOp func = getFunction(); + // This function makes all the imperfect loop nests perfect loop nests + // present inside the function func + check_make_perfect_loop(func); + + // this variable contains the index of best permutaion in 3d vector valid perm + int bestPermIndex = -1; + + // Walk through all loops in a function in innermost-loop-first order. + func.walk([&](AffineForOp fop) { + // Finding the outermost loop + Operation *par_op = fop.getParentOp(); + if (isa(*par_op)) { + // fop is the outermost loop + bool if_present = checkif(fop); + bool rectangularLoopNest = checkRect(fop); + // This is true when there is no AffineIfOp inside the given loopnest and + // also it is rectangular + if (!(if_present) && rectangularLoopNest) { + // Call to the function which will find all the dependences and access + // matrices and their ranks inside loop body of fop + find_dep_access(fop); + + // generating all valid permutations of loopnest and storing into + // validperms + valid_perm(); + + if (validperms.size() > 1) { + // This function finds the most efficient permutation + bestPermIndex = best_perm(); + + // This function permutes the loop for the best possible permutation + bestLoopPermute(validperms[bestPermIndex], fop); + } + } + } + }); +} + +std::unique_ptr> mlir::createAffineLoopInterchangePass() { + return std::make_unique(); +} diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt index dddcc93adf0d6..88e1a6565a0d1 100644 --- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRAffineTransforms LoopUnrollAndJam.cpp SuperVectorize.cpp SimplifyAffineStructures.cpp + AffineLoopInterchange.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine