import multiIndexWarningTemplate from './findrelations_multi_index_warning.html';

import { fieldSpec } from 'ui/kibi/utils/field';
import { promiseMapSeries } from 'ui/kibi/utils/promise';

import { BaseModalProvider } from 'ui/kibi/modals/base_modal';
import { GetUUIDProvider } from 'ui/kibi/helpers/get_uuid';
import { JobCancelProvider } from 'ui/kibi/helpers/job_cancel';

import errors from 'request-promise/errors';
import Bluebird from 'bluebird';
import _ from 'lodash';


export function RelationsHelperProvider(Private, mappings, es, config, sessionId) {
  // Collaborators resolved through the `Private` injector handed to this provider
  const baseModal = Private(BaseModalProvider);
  const getUUID = Private(GetUUIDProvider);
  const jobCancel = Private(JobCancelProvider);


  // JDBC queries have a default queue size of 40 - see ES thread pool
  // 'federate.connector.query'.

  // Used below as the maximum cumulative weight (number of matched indices)
  // of the requests packed inside a single msearch call.
  const msearchLimit = 20;            // Half the pool's queue size

  // Maximum number of processed indices per index pattern in 'limited' mode
  const limitedModeIndicesPerPattern = 5;

  // Maximum size for terms included inside an msearch request for accurate
  // term matching. It should be set slightly short of the default 'maxPayloadBytes'
  // configuration parameter (1MB), to allow for JSON parameters and headers.
  //
  // The 9e5 budget is divided by msearchLimit, so the resulting value is the
  // size allowed to the terms list of each single request.
  const maxTermsListSize = 9e5 / msearchLimit;


  // Default regular expressions for interesting data categories.
  // Values are regular expression *sources* (strings), hence the doubled backslashes.
  const defaultRegExpStrings = {
    // Whole value shaped like local-part@domain.tld
    email: '^[\\w._%+-]+@[\\w.-]+\\.[a-zA-Z]{2,}$',
    // Value starting with a scheme followed by '://'
    uri: '^[a-zA-Z0-9+.-]+://',

    // Result of `ipRegex({ exact: true })` from npm package 'ip-regex'
    // eslint-disable-next-line max-len
    ip: '(?:^(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}$)|(?:^((?:[a-fA-F\\d]{1,4}:){7}(?:[a-fA-F\\d]{1,4}|:)|(?:[a-fA-F\\d]{1,4}:){6}(?:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|:[a-fA-F\\d]{1,4}|:)|(?:[a-fA-F\\d]{1,4}:){5}(?::(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|(:[a-fA-F\\d]{1,4}){1,2}|:)|(?:[a-fA-F\\d]{1,4}:){4}(?:(:[a-fA-F\\d]{1,4}){0,1}:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|(:[a-fA-F\\d]{1,4}){1,3}|:)|(?:[a-fA-F\\d]{1,4}:){3}(?:(:[a-fA-F\\d]{1,4}){0,2}:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|(:[a-fA-F\\d]{1,4}){1,4}|:)|(?:[a-fA-F\\d]{1,4}:){2}(?:(:[a-fA-F\\d]{1,4}){0,3}:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|(:[a-fA-F\\d]{1,4}){1,5}|:)|(?:[a-fA-F\\d]{1,4}:){1}(?:(:[a-fA-F\\d]{1,4}){0,4}:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|(:[a-fA-F\\d]{1,4}){1,6}|:)|(?::((?::[a-fA-F\\d]{1,4}){0,5}:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}|(?::[a-fA-F\\d]{1,4}){1,7}|:)))(%[0-9a-zA-Z]{1,})?$)'
  };


  /** Builds a lookup key by joining two strings with a '//' separator */
  function pairHash(first, second) {
    return ''.concat(first, '//', second);
  }

  /** Builds a human readable label, with the first string in parentheses before the second */
  function pairText(first, second) {
    return '('.concat(first, ') ', second);
  }

  /** Builds the lookup key of a Field from its index pattern title and its name */
  function fieldHash(field) {
    const { indexPattern, name } = field;

    return pairHash(indexPattern.title, name);
  }

  /**
   * Looks up the fingerprint of a field inside the main fingerprints document,
   * which is indexed by index pattern title first and by field name next.
   *
   * @param {Object}  fps     Main fingerprints document
   * @param {Field}   field   Field whose fingerprint is requested
   * @returns {Fingerprint}   Fingerprint stored for the input field
   */
  function fieldFingerprint(fps, field) {
    const { name, indexPattern: { title } } = field;

    return fps[title][name];
  }

  /**
   * Reads the 'index' entry out of the search source of a SavedSearch instance,
   * that is the IndexPattern the saved search works on.
   */
  function ssearchIndexPattern(ssearch) {
    const { searchSource } = ssearch;

    return searchSource.get('index');
  }


  /**
   * Builds the header line of an msearch sub-request.
   *
   * @param {String} index    Index (pattern) the sub-request is addressed to
   * @returns {Object}        Header tolerating unavailable indices and using the
   *                          provider's `sessionId` as search preference
   */
  function indexHeader(index) {
    const header = { index };

    header.ignore_unavailable = true;
    header.preference = sessionId;

    return header;
  }

  /**
   * Wraps a search body into a request that `es.msearch` can consume and that
   * can be merged with other requests of the same kind.
   *
   * The `body` of the result is a pair: the header for the target index first,
   * the actual search body next. The search body receives a default `timeout`
   * taken from the 'siren:autoRelations:shardTimeout' setting (5000ms when
   * unset); note that the input `body` object is completed in place.
   *
   * @param {String} index    Index pattern to be used in the header
   * @param {Object} body     Body of the actual (inner) request
   * @param {Object} [opts]   Additional properties copied on the request, useful
   *                          for tracking it
   *
   * @returns {Object}        A single request converted to msearch format
   */
  function msearchRequest(index, body, opts = {}) {
    const shardTimeout = config.get('siren:autoRelations:shardTimeout', 5000);

    const header = indexHeader(index);
    const search = _.defaults(body, { timeout: shardTimeout + 'ms' });

    return _.assign({ body: [ header, search ] }, opts);
  }

  /**
   * Builds the msearch request extracting the terms of a field, by means of a
   * terms aggregation over the documents where the field exists.
   *
   * @param {Field} field         Field whose terms have to be extracted
   * @param {Number} termsCount   Number of terms to extract
   * @param {Object} opts         Additional parameters useful for tracking the request
   *
   * @returns {Object}            The desired terms request in msearch format
   */
  function termsRequest(field, termsCount, opts) {
    const bool = { must: { exists: fieldSpec(field) } };

    if (field.type === 'string') {
      // Empty strings are not interesting terms
      bool.must_not = { term: { [field.name]: '' } };
    }

    const index = field.indexPattern.title;

    // Same size on every shard as in the final result
    const termsAgg = _.assign({
      size: termsCount,
      shard_size: termsCount
    }, fieldSpec(field));

    return msearchRequest(index, {
      size: 0,
      query: { bool },
      aggs: { terms: { terms: termsAgg } }
    }, opts);
  }

  /**
   * Tells whether a term can be matched by exact value: true for strings and
   * for integers that Javascript represents without approximation.
   */
  function isExact(term) {
    if (_.isString(term)) { return true; }

    return Number.isSafeInteger(term);
  }

  /**
   * Builds the msearch request finding documents whose field holds exactly the
   * input term.
   *
   * @param {*} term          Term to look for
   * @param {Field} field     Field that has to hold the term
   * @param {Object} opts     Tracking properties for the request; `opts.size` is
   *                          the number of documents to return (0 when missing)
   * @returns {Object}        The request in msearch format
   */
  function exactMatchRequest(term, field, opts) {
    const index = field.indexPattern.title;
    const size = opts.size || 0;
    const filter = { term: { [field.name]: term } };

    return msearchRequest(index, { size, query: { bool: { filter } } }, opts);
  }

  // Terms strictly inside these bounds can be moved by 1024 units in both
  // directions; bounds are derived from 2^63, with a 1024 units margin.
  const longBounds = [ -Math.pow(2, 63) + 1024, Math.pow(2, 63) - 1 - 1024];

  /**
   * Builds the range of values lying within 1024 units from the input term.
   * A side of the range is left open when the term is not strictly inside
   * the corresponding entry of `longBounds`.
   *
   * @param {Number} term     Center of the range
   * @returns {Object}        Range with optional `gte` and `lte` properties
   */
  function termRange(term) {
    const [ lowest, highest ] = longBounds;
    const bounds = {};

    if (term > lowest) { bounds.gte = term - 1024; }
    if (term < highest) { bounds.lte = term + 1024; }

    return bounds;
  }

  /**
   * Builds the msearch request finding documents whose field holds a value
   * close enough to the input term.
   *
   * @param {Number} term     Possibly approximated term to look for
   * @param {Field} field     Field that has to hold the term
   * @param {Object} opts     Tracking properties for the request; `opts.size` is
   *                          the number of documents to return (0 when missing)
   * @returns {Object}        The request in msearch format
   */
  function approximateMatchRequest(term, field, opts) {
    // Number terms beyond Number.MAX_SAFE_INTEGER (2^53 - 1) are approximated
    // by Javascript during conversion from the network response to Javascript Number.
    //
    // The approximation can be roughly estimated - within < 2^n, numbers are represented
    // with a `gap` of at most 2^(n - 53) units. So, for 64 bit integers with sign
    // (ES longs), we have at most 2^(63 - 53) = 1024 unit gaps.
    //
    // Therefore, the true value of an approximated integer lies within 1024 units
    // above or below the approximated (input) value.

    const index = field.indexPattern.title;
    const size = opts.size || 0;
    const filter = { range: { [field.name]: termRange(term) } };

    return msearchRequest(index, { size, query: { bool: { filter } } }, opts);
  }

  /**
   * Builds a single msearch request matching any of the input terms by exact
   * value, with a cardinality aggregation over the field of the matched documents.
   *
   * @param {Array} terms     Terms to look for
   * @param {Field} field     Field that has to hold the terms
   * @param {Object} opts     Tracking properties for the request; `opts.size` is
   *                          the number of documents to return (0 when missing)
   * @returns {Object}        The request in msearch format
   */
  function exactMatchRequests(terms, field, opts) {
    const index = field.indexPattern.title;
    const size = opts.size || 0;
    const filter = { terms: { [field.name]: terms } };

    const cardinality = {
      cardinality: {
        field: field.name,
        precision_threshold: 100
      }
    };

    return msearchRequest(index, {
      size,
      query: { bool: { filter } },
      aggs: { cardinality }
    }, opts);
  }

  /**
   * Builds a single msearch request matching any of the input terms by range
   * (see `termRange`), with a cardinality aggregation over the field of the
   * matched documents.
   *
   * @param {Number[]} terms  Possibly approximated terms to look for
   * @param {Field} field     Field that has to hold the terms
   * @param {Object} opts     Tracking properties for the request; `opts.size` is
   *                          the number of documents to return (0 when missing)
   * @returns {Object}        The request in msearch format
   */
  function approximateMatchRequests(terms, field, opts) {
    const index = field.indexPattern.title;
    const size = opts.size || 0;

    // One range clause per term, any of them is enough for a document to match
    const should = _.map(terms, term => ({
      range: { [field.name]: termRange(term) }
    }));

    const cardinality = {
      cardinality: {
        field: field.name,
        precision_threshold: 100
      }
    };

    return msearchRequest(index, {
      size,
      query: { bool: { filter: { bool: { should } } } },
      aggs: { cardinality }
    }, opts);
  }

  /**
   * Builds the msearch request matching a single term against a field, by exact
   * value when the term allows it and by approximation range otherwise.
   */
  function matchRequest(term, field, opts) {
    if (isExact(term)) {
      return exactMatchRequest(term, field, opts);
    }

    return approximateMatchRequest(term, field, opts);
  }

  /**
   * Splits the terms of a string field into chunks whose cumulative length
   * stays within a maximum size, preserving the order of the terms.
   *
   * A chunk is closed *before* adding the term that would take it over the
   * limit, so the limit can only be exceeded by chunks made of a single term
   * that is longer than the limit by itself.
   *
   * Terms of non-string fields are not split and end up in a single chunk.
   *
   * @param {Array} terms         Terms to split
   * @param {Field} field         Field the terms belong to
   * @param {Number} [maxSize]    Maximum cumulative length of the terms of a
   *                              chunk, defaults to `maxTermsListSize`
   * @returns {Array[]}           List of terms chunks, with no empty chunks
   */
  function chunkTermsBySize(terms, field, maxSize = maxTermsListSize) {
    if (field.type !== 'string') { return [ terms ]; }

    const result = [];

    let currentChunk = [];
    let currentSize = 0;

    for (const term of terms) {
      // Measured on the string form, so that a stray non-string term
      // cannot turn the running size into NaN
      const termSize = String(term).length;

      if (currentChunk.length && currentSize + termSize > maxSize) {
        result.push(currentChunk);

        currentChunk = [];
        currentSize = 0;
      }

      currentSize += termSize;
      currentChunk.push(term);
    }

    if (currentChunk.length) {
      result.push(currentChunk);
    }

    return result;
  }

  /**
   * Builds the msearch requests matching a list of term buckets against a field.
   *
   * Terms suitable for exact matching are split in size-bound chunks with one
   * request each, followed by a single request for all the remaining terms,
   * which are matched by approximation range.
   *
   * @param {Object[]} terms  Term buckets, the term is read from their `key`
   * @param {Field} field     Field that has to hold the terms
   * @param {Object} opts     Tracking properties copied on each request
   * @returns {Object[]}      Requests in msearch format, possibly none
   */
  function matchRequests(terms, field, opts) {
    const keys = _.map(terms, 'key');
    const { true: exactTerms, false: approximateTerms } = _.groupBy(keys, isExact);

    const requests = [];

    if (exactTerms) {
      for (const chunkTerms of chunkTermsBySize(exactTerms, field)) {
        requests.push(exactMatchRequests(chunkTerms, field, opts));
      }
    }

    if (approximateTerms) {
      // No need to chunk, these are all numbers
      requests.push(approximateMatchRequests(approximateTerms, field, opts));
    }

    return requests;
  }


  /**
   * Convenience wrapper for `es.msearch` that takes a canceling promise to
   * signal external cancelation of the request.
   *
   * On cancelation the pending request is aborted (when it supports `abort`)
   * and the job cancel helper is invoked with the id sent in the request's
   * 'X-Opaque-Id' header.
   *
   * @param {Object|Object[]} requests
   *    Requests to execute. The input value can either be an object with
   *    a `body` value (like es.msearch), or an array of requests with a
   *    `body` value, which will be conveniently merged into the actual
   *    msearch input.
   * @param {Promise} canceledPromise
   *    Additional promise that can be specified to cancel the request on resolve.
   *    Defaults to a promise that never settles.
   *
   * @returns {Bluebird.Promise}
   *    Resolves to a list of all responses or rejects with the canceledPromise's
   *    resolved value. The result is a Bluebird promise on every path, callers
   *    rely on Bluebird-only methods such as `.each`.
   */
  function cancelableMSearch(requests, canceledPromise = Promise.race([])) {
    // Wrapped to get synchronous inspection (isFulfilled/value)
    canceledPromise = Bluebird.resolve(canceledPromise);

    if (canceledPromise.isFulfilled()) {
      // Canceled before starting. This must be a Bluebird rejection too: a
      // native promise would make callers chaining `.each` throw a TypeError.
      return Bluebird.reject(canceledPromise.value());
    }

    const body = _.isArray(requests)
      ? _(requests).map('body').flatten().value()
      : requests.body;

    const cancelId = getUUID.get();
    const request = es.msearch({ headers: { 'X-Opaque-Id': cancelId }, body });

    return Bluebird.race([ request, canceledPromise ])
      .then(function (allResp) {
        if (canceledPromise.isFulfilled()) {
          // Cancelation won the race (or arrived right after the response):
          // drop the request on both the client and the backend side
          if (request.abort) { request.abort(); }
          jobCancel.cancel(cancelId);

          // Adopted by the surrounding Bluebird chain
          return Promise.reject(canceledPromise.value());
        }

        return allResp.responses;
      });
  }

  /**
   * Variant of `cancelableMSearch` behaving like Promise.all: one failed
   * sub-request is enough to make the whole operation fail.
   *
   * @param {Object|Object[]} requests
   *    Requests to execute. The input value can either be an object with
   *    a `body` value (like es.msearch), or an array of requests with a
   *    `body` value, which will be conveniently merged into the actual
   *    msearch input.
   * @param {Promise} canceledPromise
   *    Additional promise that can be specified to cancel the request on resolve.
   *
   * @returns {Bluebird.Promise}
   *    Resolves to a list of all responses, rejects with an error (either
   *    on the request itself, or a StatusCodeError built from the first response
   *    carrying an `error`) or rejects with the canceledPromise's resolved value.
   */
  function msearchAll(requests, canceledPromise = Promise.race([])) {
    const rejectOnFirstFailure = responses => {
      const failed = _.find(responses, resp => resp.error);

      if (failed) {
        return Promise.reject(new errors.StatusCodeError(failed.status, failed));
      }

      return responses;
    };

    return cancelableMSearch(requests, canceledPromise).then(rejectOnFirstFailure);
  }

  /**
   * Lists the distinct index patterns the input fields belong to.
   *
   * @param {Field[]} fields      Fields to inspect
   * @returns {IndexPattern[]}    Index patterns of the fields, without repetitions
   */
  function fieldsToIndexPatterns(fields) {
    const indexPatterns = _.map(fields, 'indexPattern');

    return _.uniq(indexPatterns);
  }

  /**
   * Links together the data of parent fields and of their multifields.
   *
   * On output, every field data with multifields has a `multifields` list, each
   * entry of that list has a `parent` reference back, and the index data gets a
   * `fieldsByMultifieldStatus` property with the `parent` and `multifield`
   * field data lists.
   *
   * @param {IndexData} indexData
   *    Formatted index pattern data for auto-relations/fingerprint consumption,
   *    will be mutated in output
   */
  function addMultifieldsHierarchy(indexData) {
    const fieldsData = indexData.fields;

    // Every multifield declared by some field, by name
    const declaredMultifields = _.flatten(
      _.map(fieldsData, fieldData => fieldData.orig.multifields));
    const multifieldsByName = _.indexBy(declaredMultifields, 'name');

    const parent = _.filter(fieldsData, fieldData => fieldData.orig.multifields.length);
    const multifield = _.filter(fieldsData, fieldData => multifieldsByName[fieldData.fName]);

    // From now on names resolve to field data, wherever it is available
    for (const fieldData of multifield) {
      multifieldsByName[fieldData.fName] = fieldData;
    }

    for (const parentData of parent) {
      parentData.multifields = _.map(parentData.orig.multifields,
        mfield => multifieldsByName[mfield.name]);

      _.forEach(parentData.multifields, mfData => { mfData.parent = parentData; });
    }

    indexData.fieldsByMultifieldStatus = { parent, multifield };
  }

  /**
   * Decides how many indices per index pattern should be analyzed.
   *
   * When some index pattern matches more indices than the 'limited' mode cut
   * and its fields-times-indices product is high enough, a modal asks the user
   * to choose between a 'full' analysis over all the indices and a 'limited'
   * one over just a handful of them.
   *
   * @param {IndexPattern[]} indexPatterns
   *    List of index patterns to check
   * @param {Object} indicesByPattern
   *    A hash of index patterns to matched indices lists
   * @returns {Promise}
   *    Promise resolving to the maximum number of indices per pattern to
   *    analyze or undefined if no maximum number is required. Can reject with
   *    no value in case users decided to avoid the question.
   */
  function chooseMaxIndicesPerPattern(indexPatterns, indicesByPattern) {
    const workloads = _.map(indexPatterns, indexPattern => {
      const indicesCount = indicesByPattern[indexPattern.title].length;

      if (indicesCount > limitedModeIndicesPerPattern) {
        return indicesCount * indexPattern.fields.length;
      }

      return 0;   // Ignore multi-indexes with less indices than limited mode cut
    });

    const maxFieldsTimesIndices = _.max(workloads);

    // We're skipping the modal warning if we have few enough indices/fields
    if (maxFieldsTimesIndices < 25 * limitedModeIndicesPerPattern) {
      return Promise.resolve();     // undefined = no max indices per pattern
    }

    const modal = baseModal(multiIndexWarningTemplate, {
      overlayClass: 'kibi-findrel-overlay'
    });

    return modal.show()
      .then(mode => {
        if (!mode) { return Promise.reject(); }
        if (mode === 'full') { return undefined; }

        return limitedModeIndicesPerPattern;
      });
  }


  /**
   * Retrieves the indices associated to each of the specified index patterns.
   *
   * Indices are the keys of the hash returned by `mappings.getMapping` for the
   * title of each index pattern.
   *
   * @param {IndexPattern[]} indexPatterns    List of index patterns to process
   * @returns {Bluebird.Promise}              Resolves to a hash of pattern titles
   *                                          to indices lists
   */
  function getIndicesByPattern(indexPatterns) {
    const fetchMapping = indexPattern => mappings.getMapping(indexPattern.title);

    // Results come in the same order as the input index patterns
    const titleAt = (indicesHash, position) => indexPatterns[position].title;
    const indexTitles = indicesHash => _.map(indicesHash, (mapping, indexTitle) => indexTitle);

    return Bluebird.map(indexPatterns, fetchMapping)
      .then(indicesHashes => _.mapValues(_.indexBy(indicesHashes, titleAt), indexTitles));
  }


  /**
   * Builds a segmenter grouping msearch requests in chunks, so that the
   * cumulative weight of each chunk stays within `msearchLimit` whenever
   * possible. The weight of a request is the number of indices listed for the
   * index found in its header.
   *
   * A request heavier than the limit still gets a chunk for itself.
   *
   * @param {Object} indicesByPattern
   *    A hash of index patterns to matched indices lists
   * @param {Object} [opts]
   *    Currently unused
   *
   * @returns {Object}
   *    An object with two segmenters: `array`, turning a list of requests into
   *    a list of chunks, and `generator`, turning an iterator of requests into
   *    an iterator of chunks
   */
  function msearchRequestsSegmenter(indicesByPattern, opts) {
    const weightByPattern = _.mapValues(indicesByPattern, indices => indices.length);
    const maxWeight = msearchLimit;

    const newAccumulator = () => ({ chunks: [], currChunk: [], currWeight: 0 });

    // Closes the chunk in progress, if it holds anything, and starts a new one
    function flushChunks(accum) {
      if (accum.currChunk.length) { accum.chunks.push(accum.currChunk); }

      accum.currChunk = [];
      accum.currWeight = 0;
    }

    // Adds a request to the chunk in progress, closing it first if the
    // request would make it too heavy
    function accumulateChunks(accum, req) {
      const reqWeight = weightByPattern[req.body[0].index];

      if (accum.currWeight + reqWeight > maxWeight) { flushChunks(accum); }

      accum.currChunk.push(req);
      accum.currWeight += reqWeight;

      return accum;
    }

    function array(requests) {
      const accum = newAccumulator();

      _.forEach(requests, req => { accumulateChunks(accum, req); });
      flushChunks(accum);

      return accum.chunks;
    }

    function* generator(gen) {
      const accum = newAccumulator();

      for (let next = gen.next(); !next.done; next = gen.next()) {
        accumulateChunks(accum, next.value);

        // Chunks are shipped as soon as they get closed
        if (accum.chunks.length) { yield accum.chunks.shift(); }
      }

      flushChunks(accum);
      if (accum.chunks.length) { yield accum.chunks.shift(); }
    }

    return { array, generator };
  }

  /**
   * Builds a sampler restricting msearch requests to the first indices of
   * index patterns that match too many of them.
   *
   * @param {Object} indicesByPattern
   *    A hash of index patterns to matched indices lists; left untouched
   * @param {Number} maxIndicesPerPattern
   *    Maximum number of indices queried per index pattern
   *
   * @returns {Object}
   *    An object with:
   *    - `indicesByPattern`: copy of the input hash, with an additional entry
   *      for every sampled pattern whose key is the comma-separated list of
   *      the retained indices and whose value is the retained indices list
   *    - `map`: function returning the input request when no sampling is
   *      needed, or a copy of it addressed to the retained indices otherwise.
   *      The input request is never modified.
   */
  function multiIndexRequestsSampler(indicesByPattern, maxIndicesPerPattern) {
    const sampledIndicesByPattern = Object.assign({}, indicesByPattern);

    // Keys are listed up front, since the loop adds new entries to the hash
    Object.keys(sampledIndicesByPattern).forEach(pattern => {
      const indices = sampledIndicesByPattern[pattern];
      if (indices.length <= maxIndicesPerPattern) { return; }

      const sampled = indices.slice(0, maxIndicesPerPattern);
      sampledIndicesByPattern[sampled.join(',')] = sampled;
    });

    function map(req) {
      const header = req.body[0];
      const indices = sampledIndicesByPattern[header.index];

      if (indices.length <= maxIndicesPerPattern) { return req; }

      // The header gets copied as well: it's still shared with the original
      // request after copying the request and its body list
      const sampledHeader = Object.assign({}, header, {
        index: indices.slice(0, maxIndicesPerPattern).join(',')
      });

      const sampledBody = req.body.slice();
      sampledBody[0] = sampledHeader;

      return Object.assign({}, req, { body: sampledBody });
    }

    return { indicesByPattern: sampledIndicesByPattern, map };
  }

  /**
   * Builds a _mediator_ for msearch requests, essentially required to make
   * sure that the simultaneous workload of msearch requests doesn't exceed the
   * limits of the backend architecture.
   *
   * Examples of backend limitations include the hard maximum limit of
   * per-shard requests in a ES cluster's scheduling node, which is
   * particularly troublesome for index patterns matching multiple indexes.
   *
   * The mediator samples the indices of each request first (only when a
   * positive `maxIndicesPerPattern` is supplied) and groups requests in
   * chunks next.
   *
   * @param {Object} indicesByPattern
   *    A hash of index patterns to matched indices lists
   * @param {Integer}  [maxIndicesPerPattern]
   *    Maximum number of indexes queried per index pattern. Chosen indices are
   *    the first in the indicesByPattern parameter.
   *
   * @returns {Object}
   *    An object with mediators for msearch request `array`s and `generator`s,
   *    plus the `map` function preparing a single request
   */
  function makeRequestsMediator(indicesByPattern, maxIndicesPerPattern) {
    const sampler = (maxIndicesPerPattern > 0)
      ? multiIndexRequestsSampler(indicesByPattern, maxIndicesPerPattern)
      : { map: _.identity, indicesByPattern };

    const map = sampler.map;
    const segmenter = msearchRequestsSegmenter(sampler.indicesByPattern);

    function array(arr) {
      return segmenter.array(arr.map(map));
    }

    // Lazily applies `map` to the requests coming out of an iterator
    function* mapEach(gen) {
      for (let next = gen.next(); !next.done; next = gen.next()) {
        yield map(next.value);
      }
    }

    function generator(gen) {
      return segmenter.generator(mapEach(gen));
    }

    return { map, array, generator };
  }


  /**
   * Retrieves terms from the specified field.
   *
   * @param {Field} field
   *    Field to retrieve terms from
   * @param {Number} termsCount
   *    Max number of terms to retrieve
   * @param {Object} mediator
   *    Mediator responsible for preparing the request, only its `map` is used
   * @param {Object} opts
   *    Additional optional parameters
   * @param {Promise} [opts.canceledPromise]
   *    Additional promise that can be specified to cancel the request on resolve.
   *
   * @returns {Bluebird.Promise}
   *    Resolves to the list of terms, rejects with an error or rejects with
   *    the canceledPromise's resolved value.
   */
  function getTerms(field, termsCount, mediator, opts) {
    const request = mediator.map(termsRequest(field, termsCount));

    // Single request, single response: terms are the keys of its buckets
    const bucketKeys = responses => _.map(responses[0].aggregations.terms.buckets, 'key');

    return msearchAll([ request ], opts.canceledPromise).then(bucketKeys);
  }

  /**
   * Retrieve a document for each of the input terms matching the specified fields.
   *
   * @param {Field[]} fields
   *    Fields where the specified terms have to match
   * @param {String[]} terms
   *    Terms to find a document each
   * @param {Object} mediator
   *    Mediator responsible for preparing request groups for msearch concurrency
   * @param {Object} opts
   *    Additional optional parameters
   * @param {Promise} [opts.canceledPromise]
   *    Additional promise that can be specified to cancel the request on resolve.
   * @param {Function} [opts.onTermDocsReceived]
   *    Callback invoked when the documents of some term have been retrieved.
   *    The callback receives the current snapshot of all retrieved termDocs.
   *
   * @returns {Bluebird.Promise}
   *    Resolves to nested lists of documents, one list per term and each list
   *    with a document (or undefined if not matched) per field. Can reject with
   *    an error or with the canceledPromise's resolved value.
   */
  function getTermDocs(fields, terms, mediator, opts) {
    const { onTermDocsReceived = _.noop } = opts;
    const lastFieldPos = fields.length - 1;
    const termDocs = [];

    // Requests are compiled term by term, so we can ship termDocs as they
    // get retrieved. Each request tracks its field (f) and term (t) positions.
    const requests = [];

    _.forEach(terms, (term, t) => {
      _.forEach(fields, (field, f) => {
        requests.push(matchRequest(term, field, { size: 1, f, t }));
      });
    });

    function storeDoc(req, resp) {
      const { f, t } = req;
      const doc = resp.hits.hits[0];

      // The first field of a term comes first and prepares the term's list
      if (!f) { termDocs[t] = _.map(fields, () => []); }

      termDocs[t][f] = doc;

      // The last field of a term completes it
      if (f === lastFieldPos) { onTermDocsReceived(termDocs.slice()); }
    }

    const evalRequestChunk = reqChunk => msearchAll(reqChunk, opts.canceledPromise)
      .each((resp, r) => storeDoc(reqChunk[r], resp));

    return promiseMapSeries(mediator.array(requests), evalRequestChunk)
      .then(() => termDocs);
  }


  /**
   * @typedef {Object} Link
   * @property {Field} source           Source endpoint
   * @property {Field} target           Target endpoint
   * @property {Number} score           Average docs matched per term
   */

  /**
   * Calculates links between target fields and other source fields, by
   * extracting some source terms and checking whether they exist in each target
   * by network request.
   *
   * Requests are issued through `this.cancelableMSearch`, so this function has
   * to be invoked as a method of an object exposing `cancelableMSearch`.
   *
   * The score of a resulting link is the percentage (0 to 100) of source terms
   * found in the target. Pairs of fields with different type or ES type are
   * never matched.
   *
   * @param {Array}   targetFields
   *    Destination fields where term lookup requests will take place
   * @param {Array}   sourceFields
   *    Source fields from which terms will be extracted
   * @param {Object}  requestsMediator
   *    Mediator responsible for preparing request groups for msearch concurrency
   * @param {Object}  [opts]
   *    Additional options
   *
   * @param {Number}  [opts.termsCount=200]
   *    Number of terms retrieved from source candidates for match testing
   * @param {Number}  [opts.requiredMatchesRatio=1]
   *    Ignore links with a match ratio lower than this value
   * @param {Object}  [opts.progress]
   *    Progress bar data for progress notifications. When supplied, its
   *    `notifyStart`, `canceled` and `canceledPromise` members are used;
   *    when missing, the procedure runs without notifications and cancelation.
   * @param {Object}  [opts.progressSize=1]
   *    Desired size of progress for the whole function
   * @param {Function}  [opts.onError]
   *    Function to be invoked when a network error is received. If undefined,
   *    the error will be thrown as exception. If defined, the supplied
   *    function will be invoked with the request generating the error and the
   *    error response, and the procedure will resume.
   *
   * @returns {Promise<Link[]>}
   *    Resolves to the links found between the specified input fields
   */
  function accurateTermsMatching(targetFields, sourceFields, requestsMediator, opts) {
    opts = _.defaults({}, opts, {
      termsCount: 200,
      requiredMatchesRatio: 1,
      progressSize: 1,
      onError: (req, resp) => { throw resp.error; }
    });

    const { progress, progressSize, onError } = opts;

    // 'progress' is optional, so its cancelation promise has to be read
    // defensively. An undefined value lets the msearch wrapper apply its default.
    const canceledPromise = progress ? progress.canceledPromise : undefined;

    const opsCount =
      sourceFields.length +                             // Terms retrieval
      sourceFields.length * targetFields.length;        // Terms matching

    // From now on, 'opts' doubles as the state passed along the processing steps
    _.assign(opts, {
      targetFields,
      sourceFields,
      requestsMediator,
      progInc: progressSize / opsCount
    });


    // Binding 'this', we want to use the cancelableMSearch defined in the
    // containing objects for testing purposes
    const evalTermsRequests = (args, requests) => {
      const { fieldsData } = args;

      return this.cancelableMSearch(requests, canceledPromise)
        .each(function (resp, r) {
          const request = requests[r];
          const { field } = request;

          let terms;
          if (resp.error) {
            // Unless onError throws, failed fields go on with no terms
            onError(request, resp);
            terms = [];
          } else {
            terms = resp.aggregations.terms.buckets;
          }

          fieldsData[fieldHash(field)] = { field, terms };
        });
    };

    // Step 1 - fills args.fieldsData with the terms of each source field
    function retrieveTerms(args) {
      const { termsCount, progress, progInc, requestsMediator } = args;

      args.fieldsData = {};

      const termsRequestChunks = _(sourceFields)
        .map(field => termsRequest(field, termsCount, {
          context: 'accurate-match-terms-get', field
        }))
        .thru(requestsMediator.array)
        .value();

      return promiseMapSeries(termsRequestChunks, (reqChunk, c) => {
        // A falsy notification result stops the procedure
        if (progress && !progress.notifyStart(
          `Retrieving terms (${c + 1}/${termsRequestChunks.length})`,
          reqChunk.length * progInc)) {
          return Promise.reject();
        }

        return evalTermsRequests(args, reqChunk);
      })
        .then(() => args);
    }

    function keyMatchRegister() {
      return { termsFound: 0 };
    }

    // Lazily generates match requests for every compatible target/source pair,
    // registering a match counter per pair into 'results' along the way
    function* generateKeyMatchRequests(results, args) {
      const {
        targetFields, sourceFields, fieldsData,
        progress, progInc
      } = args;

      for (let k = 0; k < targetFields.length; ++k) {
        const targetField = targetFields[k];
        const kHash = fieldHash(targetField);

        const keyResults = results[kHash] = _(sourceFields)
          .indexBy(fieldHash)
          .mapValues(keyMatchRegister)
          .value();

        for (let f = 0; f < sourceFields.length; ++f) {
          if (progress) {
            progress.notifyStart(`Matching terms (${k + 1}/${targetFields.length})`, progInc);
          }

          const field = sourceFields[f];
          const fHash = fieldHash(field);
          const fieldData = fieldsData[fHash];

          // Only fields of the same kind can match, and not with themselves
          if (targetField.type !== field.type ||
            targetField.esType !== field.esType ||
            kHash === fHash) { continue; }

          const { terms } = fieldData;
          const requests = matchRequests(terms, targetField, {
            context: 'accurate-match-test', keyResults, fieldData
          });

          for (const req of requests) { yield req; }
        }
      }
    }

    // Binding 'this', see evalTermsRequests
    const evalMultipleKeyMatchRequests = (results, requests, args) => {
      const { progress } = args;
      if (progress && progress.canceled) { return Promise.reject(); }

      if (!requests.length) { return Promise.resolve(); }

      return this.cancelableMSearch(requests, canceledPromise)
        .each(function (resp, r) {
          const request = requests[r];

          if (resp.error) {
            onError(request, resp);
            return;
          }

          const { keyResults, fieldData } = request;
          const keyResult = keyResults[fieldHash(fieldData.field)];

          // Terms of a pair may be spread over multiple requests
          keyResult.termsFound += resp.aggregations.cardinality.value;
        });
    };

    // Step 2 - fills args.matches with the match counters of each field pair
    function matchTerms(args) {
      const { requestsMediator } = args;

      const results = {};
      const matchGenerator = requestsMediator.generator(
        generateKeyMatchRequests(results, args));

      // Evaluates one chunk of requests at a time, until chunks are over
      function doEvalMatches() {
        const next = matchGenerator.next();

        return next.done || evalMultipleKeyMatchRequests(results, next.value, args)
          .then(doEvalMatches);
      }

      return Promise.resolve()
        .then(doEvalMatches)
        .then(() => {
          args.matches = results;
          return args;
        });
    }

    // Step 3 - converts match counters to links
    function formatRelations(args) {
      const { fieldsData, matches, requiredMatchesRatio } = args;
      const result = [];

      _.forEach(targetFields, target => {
        const tHash = fieldHash(target);

        _.forEach(sourceFields, source => {
          const sHash = fieldHash(source);
          const sourceData = fieldsData[sHash];

          const termsCount = sourceData.terms.length;
          if (!termsCount) { return; }

          const { termsFound } = matches[tHash][sHash];
          const termsRatio = termsFound / termsCount;

          if (termsRatio < requiredMatchesRatio) { return; }

          const score = 100 * Math.min(Math.max(termsRatio, 0), 1);

          result.push({ source, target, score });
        });
      });

      return result;
    }


    if (!targetFields.length || !sourceFields.length) {
      // Nothing to do, just consume the whole assigned progress
      if (progress) { progress.notifyStart('', progressSize); }
      return Promise.resolve([]);
    }

    return Promise.resolve(opts)
      .then(retrieveTerms)
      .then(matchTerms)
      .then(formatRelations);
  }

  /**
   * Splits EIDs linked to multiple fields with different ES types into
   * multiple EIDs, one per type.
   *
   * The split is performed in place, by appending the source ES type to the
   * `eid` of the link targets involved.
   *
   * @param {Link[]} links
   *    Links to analyze
   * @returns {Link[]}
   *    Input links, except EIDs are all linked to a single ES type. Links with
   *    no EID as target come first.
   */
  function splitLinkEidsByType(links) {
    const [ eidLinks, directLinks ] = _.partition(links, 'target.eid');
    const result = directLinks.slice();

    // Groups are computed up front, since EID names change along the way
    const eidLinksByTarget = _.groupBy(eidLinks, 'target.eid');

    _.forEach(eidLinksByTarget, linksGroup => {
      const groupsByType = _.values(_.groupBy(linksGroup, 'source.esType'));
      const hasMultipleTypes = groupsByType.length > 1;

      for (const link of _.flatten(groupsByType)) {
        if (hasMultipleTypes) {
          link.target.eid = `${link.target.eid}__${link.source.esType}`;
        }

        result.push(link);
      }
    });

    return result;
  }

  /**
   * Serial numeric identifiers are a special case that is hard to process by
   * values alone. All serials have overlapping range from 0 upward and values
   * are continuous and therefore often match - so they all tend to be related together,
   * which is not ideal.
   *
   * To alleviate the issue we want to at least isolate links whose endpoints
   * follow the frequent pattern, where the target field name is the starting or
   * ending part of the source field name:
   *
   *    Example:  target_index.key_id  <==  source_index.foreign_key_id
   *
   * Links whose target is not a serial are always retained.
   *
   * @param {Link[]} links
   *    Input links to process
   * @param {Object} fps
   *    Main fingerprints document, providing the fingerprints of link targets
   *
   * @returns {Link[][2]}
   *    Pair with the retained and removed links
   */
  function isolateSerialEndpointsWithMatchingNames(links, fps) {
    function isSerial(field) {
      if (field.type !== 'number') { return false; }

      const { tags, attributes } = fieldFingerprint(fps, field);
      if (tags.hash_num) { return false; }

      // The -10 in range checks is because of real-life cases (cfr. sandbox:chembl-*)
      // where we have special values, like -1 (not found) ingested from sources that
      // don't have null values.
      return attributes.range[0] >= -10;
    }

    function isRetained({ source, target }) {
      // Names are compared regardless of case
      const sourceName = source.name.toLowerCase();
      const targetName = target.name.toLowerCase();

      if (!isSerial(target)) { return true; }

      return sourceName.startsWith(targetName) || sourceName.endsWith(targetName);
    }

    return _.partition(links, link => isRetained(link));
  }

  /**
   * Sort links by relevance and then extract the less relevant links making closed
   * link chains. Retained and removed links are returned inside an array pair.
   *
   * @param {Link[]} links
   *    Input links to process
   * @param {Fingerprints} fps
   *    Fingerprint metadata hashed by index pattern and field
   *
   * @returns {Link[][2]}
   *    Pair with the retained and removed links
   */
  function breakDirectLinkLoops(links, fps) {
    // Number of links pointing at each target field, keyed by field hash
    const linksPerTarget = {};

    _.forEach(links, link => {
      const targetKey = fieldHash(link.target);
      linksPerTarget[targetKey] = (linksPerTarget[targetKey] || 0) + 1;
    });

    // First ranking criterion: targets referenced by more links come first
    const byTargetPopularity = link => linksPerTarget[fieldHash(link.target)];

    // Second ranking criterion: target cardinality, boosted by 10% when the
    // target field is tagged as unique
    const byBoostedCardinality = link => {
      const fingerprint = fieldFingerprint(fps, link.target);
      const uniquenessBoost = fingerprint.tags.unique ? 0.1 : 0;

      return fingerprint.attributes.cardinality * (1 + uniquenessBoost);
    };

    const rankedLinks = _.sortByOrder(links,
      [ byTargetPopularity, byBoostedCardinality ],
      [ 'desc', 'desc' ]);

    const seenEndpoints = {};
    const retained = [];
    const removed = [];

    // Walk the links from most to least relevant; a link whose endpoints were
    // both already reached by more relevant links is extracted.
    // NOTE(review): endpoints reached through two unconnected chains also count
    // as "both seen", so this is broader than a strict cycle check - confirm
    // that this is intended.
    rankedLinks.forEach(link => {
      const sourceKey = fieldHash(link.source);
      const targetKey = fieldHash(link.target);

      const closesLoop = seenEndpoints[sourceKey] && seenEndpoints[targetKey];

      if (closesLoop) {
        removed.push(link);
      } else {
        retained.push(link);
      }

      seenEndpoints[sourceKey] = true;
      seenEndpoints[targetKey] = true;
    });

    return [ retained, removed ];
  }

  /**
   * Direct links between fields in the same index pattern are better rendered as
   * relations passing through an EID, *unless* the target field is a primary/unique key
   * of the index pattern.
   *
   * This function will extract and allocate an EID for these cases.
   *
   * @param {Link[]} links
   *    Links to operate on
   * @param {Fingerprints} fps
   *    Fingerprint metadata hashed by index pattern and field
   *
   * @returns {Link[][2]}
   *    Pair with the direct links left untouched, and the links rewritten to pass
   *    through a newly added Entity Identifier in place of index pattern loops
   */
  function splitDirectLinksWithAddedEid(links, fps) {
    const crossesIndexPatterns = link =>
      link.target.indexPattern.title !== link.source.indexPattern.title;

    // A group of links sharing one target stays direct when that target is
    // tagged as unique, or when none of its links loops on the index pattern.
    const staysDirect = sameTargetLinks => {
      const targetFingerprint = fieldFingerprint(fps, sameTargetLinks[0].target);

      if (targetFingerprint.tags.unique) { return true; }

      return sameTargetLinks.every(crossesIndexPatterns);
    };

    // Reroutes a group through an EID named after the shared target field:
    // one link from the target field itself to the EID (fixed score of 100),
    // followed by one link per original source, keeping the original score.
    const rerouteThroughEid = sameTargetLinks => {
      const sharedTarget = sameTargetLinks[0].target;

      const targetToEid = {
        source: sharedTarget,
        target: { eid: sharedTarget.name },
        score: 100
      };

      const sourcesToEid = sameTargetLinks.map(link => ({
        source: link.source,
        target: { eid: link.target.name },
        score: link.score
      }));

      return [ targetToEid ].concat(sourcesToEid);
    };

    const groupedByTarget = _.groupBy(links, link => fieldHash(link.target));
    const [ directGroups, loopGroups ] = _.partition(groupedByTarget, staysDirect);

    return [
      _.flatten(directGroups),
      _.flatten(_.map(loopGroups, rerouteThroughEid))
    ];
  }

  /**
   * Returns a link with endpoints transformed by the specified map function.
   * The map function will receive the endpoint and the link as arguments.
   *
   * @param {Link}      link        Link to hydrate
   * @param {Function}  mapFn       Mapping function for the link endpoints
   *
   * @returns {Link}  Hydrated link, with endpoints transformed by the mapping function
   */
  function hydrateLink(link, mapFn) {
    // Source is always mapped before target; both calls also get the whole link
    const hydratedSource = mapFn(link.source, link);
    const hydratedTarget = mapFn(link.target, link);

    // Every other link property is carried over into the new object as-is;
    // the input link itself is not modified.
    return _.defaults({ source: hydratedSource, target: hydratedTarget }, link);
  }


  // Object handed out by the provider: each helper or constant is exposed under
  // its own name through shorthand properties.
  return {
    defaultRegExpStrings,

    pairHash,
    pairText,
    fieldHash,
    fieldFingerprint,
    ssearchIndexPattern,

    addMultifieldsHierarchy,
    chooseMaxIndicesPerPattern,

    msearchRequest,
    termsRequest,
    cancelableMSearch,
    msearchAll,

    getIndicesByPattern,
    makeRequestsMediator,

    getTerms,
    getTermDocs,

    accurateTermsMatching,

    // Link post-processing helpers
    splitLinkEidsByType,
    isolateSerialEndpointsWithMatchingNames,
    breakDirectLinkLoops,
    splitDirectLinksWithAddedEid,

    hydrateLink
  };
}
