worldmonitor/tests/pipeline-dedup.test.mjs
Elie Habib 8655bd81bc feat(energy-atlas): GEM pipeline data import — gas 297, oil 334 (#3406)
* feat(energy-atlas): GEM pipeline data import — gas 75→297, oil 75→334 (parity-push closure)

Closes the ~3.6× pipeline-scale gap that PR #3397's import infrastructure
was built for. Per docs/methodology/pipelines.mdx operator runbook.

Source releases (CC-BY 4.0, attribution preserved in registry envelope):
  - GEM-GGIT-Gas-Pipelines-2025-11.xlsx
    SHA256: f56d8b14400e558f06e53a4205034d3d506fc38c5ae6bf58000252f87b1845e6
    URL:    https://globalenergymonitor.org/wp-content/uploads/2025/11/GEM-GGIT-Gas-Pipelines-2025-11.xlsx
  - GEM-GOIT-Oil-NGL-Pipelines-2025-03.xlsx
    SHA256: d1648d28aed99cfd2264047f1e944ddfccf50ce9feeac7de5db233c601dc3bb2
    URL:    https://globalenergymonitor.org/wp-content/uploads/2025/03/GEM-GOIT-Oil-NGL-Pipelines-2025-03.xlsx

Pre-conversion: GeoJSON (geometry endpoints) + XLSX (column properties) →
canonical operator-shape JSON via /tmp/gem-import/convert.py. Filter knobs:
  - status ∈ {operating, construction}
  - length ≥ 750 km (gas) / 400 km (oil) — asymmetric per-fuel trunk-class
  - capacity unit conversions: bcm/y native; MMcf/d, MMSCMD, mtpa, m3/day,
    bpd, Mb/d, kbd → bcm/y (gas) or bbl/d (oil) at canonical conversion factors.
  - Country names → ISO 3166-1 alpha-2 via pycountry + alias table.
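
As a sketch, the unit normalization amounts to a per-fuel factor table keyed by source unit. The numeric factors below are standard engineering conversions, but the table layout and the `toCanonical` helper are illustrative only — the real logic lives in the Python converter:

```javascript
// Illustrative per-fuel conversion tables. Gas normalizes to bcm/y,
// oil to bbl/d, matching the canonical units named in the commit.
const GAS_TO_BCM_Y = {
  'bcm/y': 1,
  'MMcf/d': (1e6 * 0.0283168 * 365) / 1e9, // 1 cf = 0.0283168 m³ → ≈ 0.01034
  'MMSCMD': (1e6 * 365) / 1e9,             // million std m³/day → 0.365
  'mtpa': 1.36, // ≈ bcm gas per Mt LNG per year (approximate LNG convention)
};
const OIL_TO_BBL_D = {
  'bpd': 1,
  'bbl/d': 1,
  'Mb/d': 1000, // thousand barrels per day
  'kbd': 1000,
  'm3/day': 6.2898, // barrels per cubic metre
};

// Hypothetical helper: normalize a (value, unit) pair to the fuel's
// canonical unit, failing loud on unknown units.
function toCanonical(value, unit, fuel) {
  const table = fuel === 'gas' ? GAS_TO_BCM_Y : OIL_TO_BBL_D;
  const factor = table[unit];
  if (factor === undefined) throw new Error(`unknown ${fuel} unit: ${unit}`);
  return value * factor;
}
```

Failing loud on an unrecognized unit (rather than passing the value through) keeps a new unit string in a future GEM release from silently corrupting capacity stats.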

Merge results (via scripts/import-gem-pipelines.mjs --merge):
  gas: +222 added, 15 duplicates skipped (haversine ≤ 5km AND token Jaccard ≥ 0.6)
  oil: +259 added, 16 duplicates skipped
  Final: 297 gas / 334 oil. Hand-curated 75+75 preserved with full evidence;
  GEM rows ship physicalStateSource='gem', classifierConfidence=0.4,
  operatorStatement=null, sanctionRefs=[].

Floor bump:
  scripts/_pipeline-registry.mjs MIN_PIPELINES_PER_REGISTRY 8 → 200.
  Live counts (297/334) leave ~100 rows of jitter headroom so a partial
  re-import or coverage-narrowing release fails loud rather than halving
  the registry silently.
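
A minimal sketch of the fail-loud gate — the constant name comes from this commit, but the function shape here is an assumption, not the real `validateRegistry` API:

```javascript
// Floor gate: refuse to publish a registry that has silently shrunk below
// the expected scale (e.g. a partial re-import or narrowed coverage).
const MIN_PIPELINES_PER_REGISTRY = 200;

function assertRegistryFloor(rows, label) {
  if (rows.length < MIN_PIPELINES_PER_REGISTRY) {
    throw new Error(
      `${label}: ${rows.length} pipelines < floor ${MIN_PIPELINES_PER_REGISTRY}` +
      ' — refusing to publish a silently-shrunken registry',
    );
  }
}
```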

Tests:
  - tests/pipelines-registry.test.mts: bumped synthetic-registry
    Array.from({length:8}) → length:210 to clear new floor; added 'gem' to
    the evidence-source whitelist for non-flowing badges (parity with the
    derivePipelinePublicBadge audit done in PR #3397 U1).
  - tests/import-gem-pipelines.test.mjs: bumped registry-conformance loop
    3 → 70 to clear new floor.
  - 51/51 pipeline tests pass; tsc --noEmit clean.

vs peer reference site (281 gas + 265 oil): we now meet or exceed both
(gas 297 vs 281, oil 334 vs 265). Functional + visual + data parity for the
energy variant is closed; the remaining gap is editorial cadence (weekly
briefing), which is intentionally out of scope per the parity-push plan.

* docs(energy-atlas): land GEM converter + expand methodology runbook for quarterly refresh

PR #3406 imported the data but didn't land the conversion script that
produced it. This commit lands the converter at scripts/_gem-geojson-to-canonical.py
so future operators can reproduce the import deterministically, and rewrites
the docs/methodology/pipelines.mdx runbook to match what actually works:

- Use GeoJSON (not XLSX) — the XLSX has properties but no lat/lon columns;
  only the GIS .zip's GeoJSON has both. The original runbook said to download
  XLSX which would fail at the lat/lon validation step.
- Cadence: quarterly refresh, with concrete signals (peer-site comparison,
  90-day calendar reminder).
- Source datasets: explicit GGIT (gas) + GOIT (oil/NGL) tracker names so
  future operators don't re-request the wrong dataset (the Extraction
  Tracker = wells/fields, NOT pipelines — ours requires the Infrastructure
  Trackers).
- Last-known-good URLs documented + URL pattern explained as fallback when
  GEM rotates per release.
- Filter knob defaults documented inline (gas ≥ 750km, oil ≥ 400km, status
  ∈ {operating, construction}, capacity unit conversion table).
- Failure-mode table mapping common errors to fixes.

Converter takes paths via env vars (GEM_GAS_GEOJSON, GEM_OIL_GEOJSON,
GEM_DOWNLOADED_AT, GEM_SOURCE_VERSION) instead of hardcoded paths so it
works for any release without code edits.

* fix(energy-atlas): close PR #3406 review findings — dedup + zero-length + test

Three Greptile findings on PR #3406:

P1 — Dedup miss (Dampier-Bunbury):
  Same physical pipeline existed in both registries — curated `dampier-bunbury`
  and GEM-imported `dampier-to-bunbury-natural-gas-pipeline-au` — because GEM
  digitized only the southern 60% of the line. The shared Bunbury terminus
  matched at 13.7 km but the average-endpoint distance was 287 km, just over
  the 5 km gate.
  Fix: scripts/_pipeline-dedup.mjs adds a name-set-identity short-circuit —
  if Jaccard == 1.0 (after stopword removal) AND any of the 4 endpoint
  pairings is ≤ 25 km, treat as duplicate. The 25 km anchor preserves the
  existing "name collision in different ocean → still added" contract.
  Added regression test: identical Dampier-Bunbury inputs → 0 added, 1
  skipped, matched against `dampier-bunbury`.
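
A sketch of that short-circuit; the helper names (`haversineKm`, `minEndpointPairingKm`, `nameSetIdentityMatch`) are assumptions and may differ from the real `_pipeline-dedup.mjs` internals:

```javascript
// Great-circle distance between two {lat, lon} points, in km.
function haversineKm(a, b) {
  const R = 6371; // mean Earth radius, km
  const rad = (d) => (d * Math.PI) / 180;
  const dLat = rad(b.lat - a.lat);
  const dLon = rad(b.lon - a.lon);
  const h =
    Math.sin(dLat / 2) ** 2 +
    Math.cos(rad(a.lat)) * Math.cos(rad(b.lat)) * Math.sin(dLon / 2) ** 2;
  return 2 * R * Math.asin(Math.sqrt(h));
}

// Try all 4 start/end pairings so reversed or partially digitized routes
// (like GEM's southern-60% Dampier-Bunbury trace) still register the one
// terminus they share.
function minEndpointPairingKm(p, q) {
  const pairs = [
    [p.startPoint, q.startPoint],
    [p.startPoint, q.endPoint],
    [p.endPoint, q.startPoint],
    [p.endPoint, q.endPoint],
  ];
  return Math.min(...pairs.map(([a, b]) => haversineKm(a, b)));
}

// Short-circuit: identical token sets AND any terminus within 25 km.
function nameSetIdentityMatch(p, q, jaccard) {
  return jaccard === 1.0 && minEndpointPairingKm(p, q) <= 25;
}
```

On the Dampier-Bunbury coordinates from the regression test, the closest of the four pairings is the shared Bunbury terminus at roughly 13.7 km — inside the 25 km anchor — while the average-endpoint distance stays far over the 5 km gate.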

P1 — Zero-length geometry (9 rows: Trans-Alaska, Enbridge Line 3, Ichthys, etc.):
  GEM source GeoJSON occasionally has a Point geometry or single-coord
  LineString, producing pipelines where startPoint == endPoint. They render
  as map-point artifacts and skew aggregate-length stats.
  Fix (defense in depth):
    - scripts/_gem-geojson-to-canonical.py drops at conversion time
      (`zero_length` reason in drop log).
    - scripts/_pipeline-registry.mjs validateRegistry rejects defensively
      so even a hand-curated row with degenerate geometry fails loud.
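
Both layers reduce to the same degenerate-geometry predicate, sketched below (the predicate name is illustrative; the real checks live in the Python converter and in validateRegistry):

```javascript
// A Point geometry or single-coord LineString collapses to a pipeline
// whose start and end coincide — a map-point artifact with zero length.
function isZeroLength(p) {
  return (
    p.startPoint.lat === p.endPoint.lat &&
    p.startPoint.lon === p.endPoint.lon
  );
}
```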

P2 — Test repetition coupled to fixture row count:
  The hardcoded `for (let i = 0; i < 70; i++)` × 3 fixture rows = 210 is
  implicitly coupled to fixture size: trim the fixture below 3 rows and the
  synthetic registry quietly drops under the floor, failing the test for a
  non-obvious reason.
  Fix: `Math.ceil(REGISTRY_FLOOR / fixture.length) + 5` derives reps from
  the floor and current fixture length.
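
For illustration, with today's floor and fixture size the derivation works out as:

```javascript
// Derive repetitions from the floor and the live fixture length instead of
// hardcoding 70. REGISTRY_FLOOR mirrors MIN_PIPELINES_PER_REGISTRY.
const REGISTRY_FLOOR = 200;
const fixture = [{}, {}, {}]; // stand-in for the 3 synthetic fixture rows
const reps = Math.ceil(REGISTRY_FLOOR / fixture.length) + 5; // 67 + 5 = 72
// 72 reps × 3 rows = 216 synthetic pipelines — above the floor, and the
// arithmetic re-derives itself if the fixture grows or shrinks.
```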

Re-run --merge with all fixes applied:
  gas: 75 → 293 (+218 added, 17 deduped — was 222/15 before; +2 catches via
       name-set-identity short-circuit; -2 zero-length never imported)
  oil: 75 → 325 (+250 added, 18 deduped — was 259/16; +2 catches; -7 zero-length)

Tests: 74/74 pipeline tests pass; tsc --noEmit clean.
2026-04-25 18:59:46 +04:00

// @ts-check
//
// Tests for scripts/_pipeline-dedup.mjs — the haversine + Jaccard dedup
// helper. Both criteria (≤5km AND ≥0.6) must hold for a match. Existing rows
// always win to preserve hand-curated evidence.
import { strict as assert } from 'node:assert';
import { test, describe } from 'node:test';
import { dedupePipelines, _internal } from '../scripts/_pipeline-dedup.mjs';
const { jaccard, averageEndpointDistanceKm, tokenize, uniqueId } = _internal;
function makePipeline(id, name, startLat, startLon, endLat, endLon) {
  return {
    id,
    name,
    startPoint: { lat: startLat, lon: startLon },
    endPoint: { lat: endLat, lon: endLon },
  };
}
describe('pipeline-dedup — internal helpers', () => {
  test('tokenize lowercases, splits, drops stopwords', () => {
    const tokens = tokenize('Trans-Siberian Pipeline System');
    assert.deepEqual(tokens.sort(), ['siberian', 'trans']);
  });
  test('tokenize removes punctuation and accents', () => {
    const tokens = tokenize('Caño Limón–Coveñas Pipeline');
    // After NFKD normalization + ascii-only filter, accented chars survive
    // as their base letter; we accept either exact or close behaviour.
    assert.ok(tokens.includes('limon') || tokens.includes('lim'),
      `expected Limón to tokenize; got ${tokens.join(',')}`);
  });
  test('jaccard returns 1.0 for identical token sets', () => {
    assert.equal(jaccard('Test Pipeline System', 'Test Pipeline'), 1.0);
  });
  test('jaccard returns 0 for fully disjoint names', () => {
    assert.equal(jaccard('Druzhba North', 'Nord Stream'), 0);
  });
  test('jaccard returns 1/3 when one of three union tokens is shared', () => {
    assert.equal(jaccard('Trans Adriatic', 'Trans Caspian'), 1 / 3);
  });
  test('haversine distance is sub-km for tiny coordinate offsets', () => {
    const a = makePipeline('a', 'A', 60, 30, 54, 13);
    const b = makePipeline('b', 'B', 60.001, 30.001, 54.001, 13.001);
    assert.ok(averageEndpointDistanceKm(a, b) < 1, 'sub-km on tiny offsets');
  });
  test('haversine distance for far-apart pipelines is large', () => {
    const a = makePipeline('a', 'A', 60, 30, 54, 13); // RU→DE
    const b = makePipeline('b', 'B', 30, -90, 25, -85); // Gulf of Mexico
    assert.ok(averageEndpointDistanceKm(a, b) > 5000);
  });
  test('uniqueId preserves base when free, suffixes when taken', () => {
    const taken = new Set(['foo', 'foo-2']);
    assert.equal(uniqueId('bar', taken), 'bar');
    assert.equal(uniqueId('foo', taken), 'foo-3');
  });
});
describe('pipeline-dedup — match logic', () => {
  test('happy path: completely-different name + far endpoints → added', () => {
    const existing = [makePipeline('druzhba-north', 'Druzhba Pipeline (Northern Branch)',
      52.6, 49.4, 52.32, 14.06)];
    const candidates = [makePipeline('nord-stream-1', 'Nord Stream 1',
      60.08, 29.05, 54.14, 13.66)];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 1);
    assert.equal(skippedDuplicates.length, 0);
  });
  test('match by both criteria: close endpoints + similar name → skipped (existing wins)', () => {
    const existing = [makePipeline('druzhba-north', 'Druzhba Pipeline',
      52.6, 49.4, 52.32, 14.06)];
    const candidates = [makePipeline('druzhba-import', 'Druzhba Pipeline',
      52.601, 49.401, 52.321, 14.061)];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 0);
    assert.equal(skippedDuplicates.length, 1);
    assert.equal(skippedDuplicates[0].matchedExistingId, 'druzhba-north');
  });
  test('identical names + one shared terminus (≤25 km) → deduped (PR #3406 Dampier-Bunbury regression)', () => {
    // Real-world case from PR #3406 review: GEM digitized only the southern
    // 60% of the line, so the shared Bunbury terminus matched at 13.7 km
    // but the average-endpoint distance was 287 km (over the 5 km gate).
    // Identical token sets + ≥1 close pairing = same physical pipeline.
    const existing = [makePipeline('dampier-bunbury', 'Dampier to Bunbury Natural Gas Pipeline',
      -20.68, 116.72, -33.33, 115.63)];
    const candidates = [makePipeline('dampier-to-bunbury-natural-gas-pipeline-au',
      'Dampier to Bunbury Natural Gas Pipeline',
      -33.265797, 115.755682, -24.86854, 113.674968)];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 0);
    assert.equal(skippedDuplicates.length, 1);
    assert.equal(skippedDuplicates[0].matchedExistingId, 'dampier-bunbury');
  });
  test('name-match only (endpoints in different ocean) → added', () => {
    const existing = [makePipeline('nord-stream-1', 'Nord Stream 1',
      60.08, 29.05, 54.14, 13.66)];
    const candidates = [makePipeline('imposter', 'Nord Stream 1',
      40.0, -100.0, 35.0, -90.0)]; // different continent
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 1, 'low haversine confidence overrides high name match');
    assert.equal(skippedDuplicates.length, 0);
  });
  test('endpoint-match only (different name) → added (real distinct pipelines can share endpoints)', () => {
    const existing = [makePipeline('yamal-europe', 'Yamal–Europe',
      67.0, 75.0, 52.0, 14.0)];
    const candidates = [makePipeline('different-route', 'Trans-Siberian Coal Slurry',
      67.001, 75.001, 52.001, 14.001)];
    const { toAdd } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 1, 'name disambiguates: same endpoints, different infrastructure');
  });
  test('reverse-direction match: candidate endpoints flipped → still detected', () => {
    const existing = [makePipeline('druzhba', 'Druzhba',
      52.6, 49.4, 52.32, 14.06)];
    // Same pipeline, route described in reverse direction
    const candidates = [makePipeline('druzhba-flipped', 'Druzhba',
      52.32, 14.06, 52.6, 49.4)];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 0);
    assert.equal(skippedDuplicates.length, 1);
  });
  test('stopword-only difference: "Pipeline System" vs "Line" → matches by Jaccard', () => {
    const existing = [makePipeline('trans-sib', 'Trans-Siberian Pipeline System',
      55, 30, 60, 90)];
    const candidates = [makePipeline('trans-sib-cand', 'Trans-Siberian Line',
      55.001, 30.001, 60.001, 90.001)];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 0);
    assert.equal(skippedDuplicates.length, 1);
    assert.ok(skippedDuplicates[0].jaccard >= 0.6);
  });
});
describe('pipeline-dedup — id collision', () => {
  test('candidate with id colliding existing gets suffixed -2', () => {
    const existing = [makePipeline('foo', 'Foo Pipeline', 0, 0, 1, 1)];
    const candidates = [makePipeline('foo', 'Bar Pipeline', 50, 50, 60, 60)];
    const { toAdd } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 1);
    assert.equal(toAdd[0].id, 'foo-2');
  });
  test('three candidates colliding the same existing id get -2, -3, -4', () => {
    const existing = [makePipeline('foo', 'Foo Pipeline', 0, 0, 1, 1)];
    const candidates = [
      makePipeline('foo', 'Bar Pipeline', 50, 50, 60, 60),
      makePipeline('foo', 'Baz Pipeline', 70, 70, 80, 80),
      makePipeline('foo', 'Qux Pipeline', 30, -30, 40, -40),
    ];
    const { toAdd } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 3);
    assert.deepEqual(
      toAdd.map((p) => p.id).sort(),
      ['foo-2', 'foo-3', 'foo-4'],
    );
  });
});
describe('pipeline-dedup — determinism', () => {
  test('two invocations on identical inputs produce identical output', () => {
    const existing = [
      makePipeline('a', 'Alpha Pipeline', 10, 10, 20, 20),
      makePipeline('b', 'Beta Pipeline', 30, 30, 40, 40),
    ];
    const candidates = [
      makePipeline('a', 'Alpha Pipeline', 10.001, 10.001, 20.001, 20.001),
      makePipeline('c', 'Gamma Pipeline', 50, 50, 60, 60),
    ];
    const r1 = dedupePipelines(existing, candidates);
    const r2 = dedupePipelines(existing, candidates);
    assert.deepEqual(
      r1.toAdd.map((p) => p.id),
      r2.toAdd.map((p) => p.id),
    );
    assert.deepEqual(
      r1.skippedDuplicates.map((d) => d.matchedExistingId),
      r2.skippedDuplicates.map((d) => d.matchedExistingId),
    );
  });
});
describe('pipeline-dedup — within-batch dedup (review fix)', () => {
  test('two candidates that match each other but not any existing → only first is added', () => {
    // Regression: pre-fix, dedup compared each candidate ONLY against the
    // original `existing` array, so two GEM rows for the same pipeline (e.g.
    // a primary entry and a duplicate from a different source spreadsheet)
    // would BOTH end up in the registry.
    const candidates = [
      makePipeline('east-west-saudi', 'East-West Crude Pipeline', 25, 49, 24, 38),
      // Same pipeline, slightly different name + endpoints (within match
      // tolerance). Should be skipped as a duplicate of the first candidate.
      makePipeline('saudi-petroline', 'East-West Crude', 25.001, 49.001, 24.001, 38.001),
    ];
    const { toAdd, skippedDuplicates } = dedupePipelines([], candidates);
    assert.equal(toAdd.length, 1, 'second matching candidate must be skipped');
    assert.equal(skippedDuplicates.length, 1);
    assert.equal(toAdd[0].id, 'east-west-saudi', 'first-accepted candidate wins (deterministic)');
    assert.equal(skippedDuplicates[0].matchedExistingId, 'east-west-saudi',
      'skipped candidate matches the earlier-accepted one, not anything in `existing`');
  });
  test('three candidates with transitive matches collapse to one', () => {
    const candidates = [
      makePipeline('a', 'Druzhba', 52.6, 49.4, 52.32, 14.06),
      makePipeline('b', 'Druzhba Pipeline', 52.601, 49.401, 52.321, 14.061),
      makePipeline('c', 'Druzhba Line', 52.602, 49.402, 52.322, 14.062),
    ];
    const { toAdd } = dedupePipelines([], candidates);
    assert.equal(toAdd.length, 1, 'three matching candidates must collapse to the first one accepted');
  });
  test('existing wins over already-accepted candidate', () => {
    // If a candidate matches an existing row, it must be reported as
    // matching the existing row (existing-vs-toAdd precedence). Names
    // chosen so Jaccard exceeds 0.6 after stopword removal.
    const existing = [makePipeline('canon', 'Druzhba Northern', 52.6, 49.4, 52.32, 14.06)];
    const candidates = [
      makePipeline('cand-1', 'Druzhba Northern', 60, 30, 50, 14), // doesn't match existing (far endpoints)
      makePipeline('cand-2', 'Druzhba Northern', 52.601, 49.401, 52.321, 14.061), // matches existing (near + Jaccard=1)
    ];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, candidates);
    assert.equal(toAdd.length, 1, 'cand-1 added; cand-2 skipped against existing');
    assert.equal(skippedDuplicates[0].matchedExistingId, 'canon',
      'cand-2 should be reported as matching the existing canon, not the earlier candidate');
  });
});
describe('pipeline-dedup — empty inputs', () => {
  test('empty existing + N candidates → all N added, none skipped', () => {
    const candidates = [
      makePipeline('a', 'A', 0, 0, 1, 1),
      makePipeline('b', 'B', 5, 5, 6, 6),
    ];
    const { toAdd, skippedDuplicates } = dedupePipelines([], candidates);
    assert.equal(toAdd.length, 2);
    assert.equal(skippedDuplicates.length, 0);
  });
  test('N existing + empty candidates → empty result', () => {
    const existing = [makePipeline('a', 'A', 0, 0, 1, 1)];
    const { toAdd, skippedDuplicates } = dedupePipelines(existing, []);
    assert.equal(toAdd.length, 0);
    assert.equal(skippedDuplicates.length, 0);
  });
});