only-bible-app

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.

Readme
Commits
Files
scripts/addRedLettersHindi.js

Contents
History
Blame
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";

// Directory containing this script, derived from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Pipe-delimited verse file; the main section reads it and writes the tagged result back to the same path.
const hiPath = path.join(__dirname, "files", "hi.txt");
// Directory that holds the USFM files named in BOOKS.
const usfmDir = path.join(__dirname, "hin2017_usfm");

// USFM file name -> hi.txt book index (0-based).
// Only the books listed here are scanned for \wj (words of Jesus) markup.
// NOTE(review): the numeric file prefixes are not monotonic (52-1CO is followed by
// 77-2CO and then 72-REV) — confirm every name against the actual contents of
// hin2017_usfm. A file that does not exist is only logged and skipped by the main
// loop, so a wrong name silently drops that book's red letter data.
const BOOKS = [
  { usfmFile: "46-MAThin2017.usfm", bookIdx: 39 },
  { usfmFile: "47-MRKhin2017.usfm", bookIdx: 40 },
  { usfmFile: "48-LUKhin2017.usfm", bookIdx: 41 },
  { usfmFile: "49-JHNhin2017.usfm", bookIdx: 42 },
  { usfmFile: "50-ACThin2017.usfm", bookIdx: 43 },
  { usfmFile: "52-1COhin2017.usfm", bookIdx: 45 },
  { usfmFile: "77-2COhin2017.usfm", bookIdx: 46 },
  { usfmFile: "72-REVhin2017.usfm", bookIdx: 65 },
];

/**
 * Parse a USFM file and extract red letter (words of Jesus) data per verse.
 *
 * Line handling:
 *   - `\c N` starts chapter N and closes the verse that was open.
 *   - `\v N [text]` opens verse N; its text is split into red / non-red segments.
 *   - Any other line that contains `\wj` is treated as a continuation (poetry
 *     lines such as `\q1`, `\q2`, ...) of the verse that is currently open.
 *
 * Only verses that contain at least one non-blank red segment are returned.
 *
 * @param {string} filePath - Path of the USFM file to read (UTF-8).
 * @returns {Map<string, {segments: {text: string, red: boolean}[], fullyRed: boolean}>}
 *   Map keyed by "chapter:verse", using the numbers written in the USFM file.
 */
function parseUSFM(filePath) {
  const raw = fs.readFileSync(filePath, "utf-8");
  const verseData = new Map();

  let currentChapter = 0;
  // Key and accumulated segments of the verse being read. Both are null until
  // the first \v of a chapter, so stray lines cannot attach to an old verse.
  let currentKey = null;
  let currentSegments = null;

  // Store the open verse as soon as it holds red text, and refresh fullyRed
  // each time more segments are appended to it.
  const storeIfRed = () => {
    const hasRed = currentSegments.some((s) => s.red && s.text.trim());
    if (!hasRed) return;
    verseData.set(currentKey, {
      segments: currentSegments,
      fullyRed: currentSegments.every((s) => s.red || !s.text.trim()),
    });
  };

  for (const line of raw.split("\n")) {
    // Chapter marker
    const chapMatch = line.match(/^\\c\s+(\d+)/);
    if (chapMatch) {
      currentChapter = Number.parseInt(chapMatch[1], 10);
      currentKey = null;
      currentSegments = null;
      continue;
    }

    // Verse marker; the text part is optional so a bare "\v N" line still
    // opens the verse for the continuation lines that follow it.
    const verseMatch = line.match(/^\\v\s+(\d+)(?:\s+(.*)|$)/);
    if (verseMatch) {
      currentKey = `${currentChapter}:${Number.parseInt(verseMatch[1], 10)}`;
      // A repeated verse number keeps appending to the entry already stored.
      currentSegments = verseData.has(currentKey) ? verseData.get(currentKey).segments : [];
      currentSegments.push(...extractRedSegments(verseMatch[2] || ""));
      storeIfRed();
      continue;
    }

    // Continuation line: attach it to the verse that is actually open, not to
    // whichever red verse happened to be stored last.
    if (currentSegments !== null && line.includes("\\wj")) {
      currentSegments.push(...extractRedSegments(line));
      storeIfRed();
    }
  }

  return verseData;
}

/**
 * Extract red/non-red text segments from a USFM line.
 *
 * Footnotes (\f ... \f*), cross-references (\x ... \x*) and \bdit ... \bdit*
 * spans are removed together with their content before the line is scanned.
 * The rest is split at \wj / \+wj (open) and \wj* / \+wj* (close) markers, and
 * every piece is passed through cleanUSFM; pieces that clean to an empty
 * string are dropped.
 *
 * Text that runs up to a closing marker is always red, even when the opening
 * marker is not on this line (a span carried over from an earlier line). Text
 * after an unclosed opening marker is red to the end of the line.
 *
 * @param {string} text - One USFM line, or the text part of a \v line.
 * @returns {{text: string, red: boolean}[]} Segments in reading order.
 */
function extractRedSegments(text) {
  const stripped = text
    .replace(/\\f\s+.*?\\f\*/g, "")
    .replace(/\\x\s+.*?\\x\*/g, "")
    .replace(/\\bdit\s+.*?\\bdit\*/g, "");

  const segments = [];
  const pushSegment = (rawText, red) => {
    const cleaned = cleanUSFM(rawText);
    if (cleaned) segments.push({ text: cleaned, red });
  };

  // Group 1 is set for a closing marker. An opening marker consumes at most one
  // following whitespace character and must not be the prefix of a longer name.
  const wjMarker = /\\\+?wj(?:(\*)|(?![a-z0-9])\s?)/g;

  let inRed = false;
  let pos = 0;
  for (const match of stripped.matchAll(wjMarker)) {
    const isClose = match[1] !== undefined;
    // Text before a close is red by definition; text before an open keeps the
    // current state (a repeated open inside a red span stays red).
    pushSegment(stripped.slice(pos, match.index), isClose || inRed);
    inRed = !isClose;
    pos = match.index + match[0].length;
  }
  pushSegment(stripped.slice(pos), inRed);

  return segments;
}

/**
 * Remove USFM formatting markers from text, keeping only the readable text.
 *
 * Closing markers (\it*, \+wj*, \qt1*) are removed first, then opening
 * markers (\it, \q1, \+nd) together with one following whitespace character.
 * Runs of whitespace are collapsed to a single space and the result is trimmed.
 *
 * @param {string} text - Raw USFM text.
 * @returns {string} Text without backslash markers.
 */
function cleanUSFM(text) {
  return (
    text
      // Closing markers, including nested (\+xx*) and numbered (\xx1*) forms
      .replace(/\\[a-z+]+\d*\*/g, "")
      // Opening markers; the optional "+" covers nested character markers
      .replace(/\\\+?[a-z]+\d?\s?/g, "")
      // Collapse whitespace
      .replace(/\s+/g, " ")
      .trim()
  );
}

/**
 * Reduce Hindi text to a canonical form for fuzzy comparison between the
 * IRV text and the hi.txt text.
 *
 * Steps, in order: drop quotes / brackets / punctuation / pilcrow, map
 * chandrabindu to anusvara, map ऊ to ू, collapse whitespace runs to one space,
 * trim. Each character mapping replaces one character with one character.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} Normalized text.
 */
function normalize(text) {
  const withoutPunctuation = text.replace(/[""''"\(\)।;:,!?¶]/g, "");
  const unifiedSigns = withoutPunctuation.replace(/ँ/g, "ं").replace(/ऊ/g, "ू");
  const singleSpaced = unifiedSigns.replace(/\s+/g, " ");
  return singleSpaced.trim();
}

/**
 * Decide how the red (words of Jesus) portion of an IRV verse maps onto the
 * hi.txt text of the same verse.
 *
 * Consecutive segments of the same colour are merged first. The result is then
 * classified by whether non-red text leads and/or trails the verse:
 *   - all red                       -> { type: "full" }
 *   - narrative, then red           -> { type: "prefix", splitIdx }
 *   - red, then narrative           -> { type: "suffix", splitIdx }
 *   - narrative + red + narrative   -> { type: "both", prefixIdx, suffixIdx }
 * Indexes come from findSplitPoint and are offsets into hiText.
 *
 * Only the first and last merged segments are inspected, so non-red runs in
 * the middle of a verse are not represented: a "prefix" or "both" result spans
 * them. A verse that starts and ends red with narration in between matches no
 * pattern and yields null.
 *
 * @param {string} hiText - Verse text from hi.txt (the part after the 6th pipe).
 * @param {{text: string, red: boolean}[]} segments - IRV segments for the verse.
 * @returns {{type: "full"} | {type: "prefix"|"suffix", splitIdx: number} |
 *   {type: "both", prefixIdx: number, suffixIdx: number} | null}
 *   null when there is no red text or no reliable boundary was found.
 */
function findRedBoundaries(hiText, segments) {
  // Combine consecutive same-type segments (on copies; the input is not mutated)
  const combined = [];
  for (const seg of segments) {
    const previous = combined[combined.length - 1];
    if (previous && previous.red === seg.red) {
      previous.text = `${previous.text} ${seg.text}`;
    } else {
      combined.push({ ...seg });
    }
  }

  // Nothing red (including an empty list) means there is nothing to tag.
  if (!combined.some((s) => s.red)) return null;

  if (combined.every((s) => s.red)) {
    return { type: "full" };
  }

  const first = combined[0];
  const last = combined[combined.length - 1];
  const hasLeadingNonRed = !first.red && first.text.trim() !== "";
  const hasTrailingNonRed = !last.red && last.text.trim() !== "";

  if (hasLeadingNonRed && !hasTrailingNonRed) {
    // narrative + red (Jesus speaks to the end of the verse)
    const splitIdx = findSplitPoint(hiText, first.text, "prefix");
    return splitIdx !== -1 ? { type: "prefix", splitIdx } : null;
  }

  if (!hasLeadingNonRed && hasTrailingNonRed) {
    // red + narrative (Jesus finishes, narrator continues)
    const splitIdx = findSplitPoint(hiText, last.text, "suffix");
    return splitIdx !== -1 ? { type: "suffix", splitIdx } : null;
  }

  if (hasLeadingNonRed && hasTrailingNonRed) {
    // narrative + red + narrative
    const prefixIdx = findSplitPoint(hiText, first.text, "prefix");
    const suffixIdx = findSplitPoint(hiText, last.text, "suffix");
    if (prefixIdx !== -1 && suffixIdx !== -1 && prefixIdx < suffixIdx) {
      return { type: "both", prefixIdx, suffixIdx };
    }
  }

  return null;
}

/**
 * Locate the offset in hiText where the red/non-red boundary should fall.
 *
 * Two strategies are tried in order:
 *   1. Anchor on IRV wording: take the 5..2 words of the IRV narration that
 *      touch the boundary (its last words for "prefix", its first words for
 *      any other position) and look for them in the normalized hi.txt text.
 *   2. Fall back to fixed Hindi phrases: for "prefix", the speech marker whose
 *      end lies furthest right; otherwise the first listed narrative
 *      continuation phrase found past 30% of the text.
 *
 * @param {string} hiText - Verse text from hi.txt.
 * @param {string} irvNonRedText - Narration (non-red) text from the IRV verse.
 * @param {string} position - "prefix" for narration before the red text;
 *   every other value is handled as narration after it.
 * @returns {number} Offset into hiText, or -1 when no boundary was found.
 */
function findSplitPoint(hiText, irvNonRedText, position) {
  const narrationBeforeRed = position === "prefix";
  const normalizedHi = normalize(hiText);
  const irvWords = normalize(irvNonRedText).split(" ").filter(Boolean);

  // Strategy 1: match progressively shorter word runs next to the boundary
  for (let count = Math.min(5, irvWords.length); count >= 2; count--) {
    const run = narrationBeforeRed ? irvWords.slice(-count) : irvWords.slice(0, count);
    const anchor = run.join(" ");
    if (anchor.length <= 3) continue;

    const hit = normalizedHi.indexOf(anchor);
    if (hit === -1) continue;

    if (narrationBeforeRed) {
      // Boundary sits after the anchor, past any whitespace and , ; :
      let cut = findOriginalPos(hiText, normalizedHi, hit + anchor.length);
      while (cut < hiText.length && /[\s,;:]/.test(hiText[cut])) cut++;
      return cut;
    }

    // Boundary sits before the anchor, ahead of the whitespace that precedes it
    let cut = findOriginalPos(hiText, normalizedHi, hit);
    while (cut > 0 && /\s/.test(hiText[cut - 1])) cut--;
    return cut;
  }

  if (narrationBeforeRed) {
    // Strategy 2: common Hindi speech markers that introduce direct speech
    const speechMarkers = [
      "उत्तर दिया,", "उत्तर दिया;", "उत्तर दिया:",
      "कहा,", "कहा;", "कहा:", "कहा।",
      "बोला,", "बोला;", "बोला:",
      "कहने लगा,", "कहने लगा;",
      "पुकारकर कहा,", "पुकारकर कहा;",
      "डांटकर कहा,", "डांटकर कहा;",
    ];

    // Keep the right-most split over the last occurrence of every marker
    return speechMarkers.reduce((best, marker) => {
      const at = hiText.lastIndexOf(marker);
      if (at === -1) return best;

      let cut = at + marker.length;
      while (cut < hiText.length && hiText[cut] === " ") cut++;
      return best === -1 || cut > best ? cut : best;
    }, -1);
  }

  // Strategy 2: phrases with which the narrator typically resumes
  const suffixMarkers = [
    " और उसका", " और वह", " तब उसने", " तब उस ने",
    " और उस ने", " फिर उस ने", " सो उस ने",
  ];
  const minimumOffset = hiText.length * 0.3;
  const found = suffixMarkers
    .map((marker) => hiText.lastIndexOf(marker))
    .find((at) => at !== -1 && at > minimumOffset);

  return found === undefined ? -1 : found;
}

/**
 * Translate an offset in normalized text into the matching offset in the
 * original text.
 *
 * Walks the original one character at a time. Characters that normalization
 * drops are stepped over without advancing the normalized cursor. Whitespace
 * advances the normalized cursor only when the normalized text also has
 * whitespace at that point. Every other character advances both cursors.
 *
 * @param {string} original - Text before normalization.
 * @param {string} normalized - The same text after normalization.
 * @param {number} normPos - Offset into `normalized`.
 * @returns {number} Offset into `original` reached once `normPos` normalized
 *   characters are consumed, or `original.length` if the text ends first.
 */
function findOriginalPos(original, normalized, normPos) {
  const droppedChar = /[""''"\(\)।;:,!?¶]/;
  const whitespace = /\s/;

  let normCursor = 0;
  let origCursor = 0;

  for (; normCursor < normPos && origCursor < original.length; origCursor++) {
    const ch = original[origCursor];

    // Removed during normalization: no counterpart in the normalized text
    if (droppedChar.test(ch)) continue;

    if (whitespace.test(ch)) {
      // Collapsed or trimmed whitespace has no counterpart either
      const normalizedHasSpace =
        normCursor < normalized.length && whitespace.test(normalized[normCursor]);
      if (normalizedHasSpace) normCursor++;
      continue;
    }

    normCursor++;
  }

  return origCursor;
}

/**
 * Wrap the red portion of one hi.txt line in <red>...</red>.
 *
 * A line is pipe-delimited: the first six fields are kept verbatim and the
 * rest (re-joined with "|") is the verse text. Leading pilcrows / whitespace
 * of the verse text stay outside the tag for the "full" and "suffix" types.
 *
 * The line is returned unchanged when it has fewer than seven fields, is
 * already tagged, has no content, has an unknown boundary type, or when the
 * supplied indexes are out of range or would produce an empty red span.
 *
 * @param {string} line - One line of hi.txt.
 * @param {{type: string, splitIdx?: number, prefixIdx?: number, suffixIdx?: number}} boundaries
 *   Boundary description; indexes are offsets into the verse text.
 * @returns {string} The tagged line, or the original line.
 */
function applyRedTags(line, boundaries) {
  const fields = line.split("|");
  if (fields.length < 7) return line;

  const head = fields.slice(0, 6).join("|");
  const verseText = fields.slice(6).join("|");

  // Don't double-tag
  if (verseText.includes("<red>")) return line;

  // Separate the leading ¶ / whitespace from the actual content
  const pieces = verseText.match(/^([¶\s]*)(.*)/);
  const leading = pieces[1] || "";
  const content = pieces[2] || "";
  if (!content.trim()) return line;

  // Indexes in `boundaries` are relative to this string (leading included)
  const fullText = leading + content;

  switch (boundaries.type) {
    case "full":
      return `${head}|${leading}<red>${content}</red>`;

    case "prefix": {
      const { splitIdx } = boundaries;
      if (splitIdx >= 0 && splitIdx < fullText.length) {
        const narration = fullText.substring(0, splitIdx);
        const spoken = fullText.substring(splitIdx);
        if (spoken.trim()) {
          return `${head}|${narration}<red>${spoken}</red>`;
        }
      }
      break;
    }

    case "suffix": {
      const { splitIdx } = boundaries;
      if (splitIdx > 0 && splitIdx <= fullText.length) {
        const spoken = fullText.substring(0, splitIdx);
        const narration = fullText.substring(splitIdx);
        if (spoken.trim()) {
          // The red span starts after the leading ¶ / whitespace
          const spokenBody = spoken.substring(leading.length);
          return `${head}|${leading}<red>${spokenBody}</red>${narration}`;
        }
      }
      break;
    }

    case "both": {
      const { prefixIdx, suffixIdx } = boundaries;
      const inRange = prefixIdx >= 0 && suffixIdx > prefixIdx && suffixIdx <= fullText.length;
      if (inRange) {
        const opening = fullText.substring(0, prefixIdx);
        const spoken = fullText.substring(prefixIdx, suffixIdx);
        const closing = fullText.substring(suffixIdx);
        if (spoken.trim()) {
          return `${head}|${opening}<red>${spoken}</red>${closing}`;
        }
      }
      break;
    }

    default:
      break;
  }

  return line;
}

// ============= MAIN =============

// Step 1: Gather red letter data from every configured USFM book
console.log("Parsing USFM files for red letter data...");
const allRedData = new Map(); // "bookIdx:chapter:verse" -> {segments, fullyRed}

for (const { usfmFile, bookIdx } of BOOKS) {
  const usfmPath = path.join(usfmDir, usfmFile);
  if (!fs.existsSync(usfmPath)) {
    console.log(`  Skipping ${usfmFile} (not found)`);
    continue;
  }

  let fullCount = 0;
  let partialCount = 0;

  parseUSFM(usfmPath).forEach((data, chapVerse) => {
    // USFM numbers are 1-based, hi.txt indexes are 0-based
    const [chapIdx, verseIdx] = chapVerse.split(":").map((part) => parseInt(part) - 1);
    allRedData.set(`${bookIdx}:${chapIdx}:${verseIdx}`, data);

    if (data.fullyRed) {
      fullCount += 1;
    } else {
      partialCount += 1;
    }
  });

  console.log(`  ${usfmFile}: ${fullCount} full, ${partialCount} partial`);
}

console.log(`Total USFM red data: ${allRedData.size} verses`);

// Step 2: Tag every hi.txt line that has red letter data, then write the file back
const hiLines = fs.readFileSync(hiPath, "utf-8").split("\n");
let taggedFull = 0;
let taggedPartial = 0;
let skippedAlready = 0;
let failedPartial = 0;

// Returns the line with <red> tags applied, or the line itself when it is left
// untouched; updates the counters above as a side effect.
const tagLine = (line) => {
  if (!line) return line;

  const fields = line.split("|");
  if (fields.length < 7) return line;

  const [, bookIdx, chapIdx, verseIdx] = fields;
  const redData = allRedData.get(`${bookIdx}:${chapIdx}:${verseIdx}`);
  if (!redData) return line;

  // Verse text is everything after the 6th pipe
  const verseText = fields.slice(6).join("|");
  if (verseText.includes("<red>")) {
    skippedAlready += 1;
    return line;
  }

  if (redData.fullyRed) {
    // Whole verse is red; an empty verse comes back unchanged and is not counted
    const tagged = applyRedTags(line, { type: "full" });
    if (tagged !== line) taggedFull += 1;
    return tagged;
  }

  // Partially red verse: locate the boundaries, then tag
  const boundaries = findRedBoundaries(verseText, redData.segments);
  if (boundaries) {
    const tagged = applyRedTags(line, boundaries);
    if (tagged !== line) {
      if (boundaries.type === "full") {
        // Recategorized as full
        taggedFull += 1;
      } else {
        taggedPartial += 1;
      }
      return tagged;
    }
  }

  failedPartial += 1;
  return line;
};

const newLines = hiLines.map((line) => tagLine(line));

fs.writeFileSync(hiPath, newLines.join("\n"), "utf-8");

console.log(`\nResults:`);
console.log(`  Tagged full:     ${taggedFull}`);
console.log(`  Tagged partial:  ${taggedPartial}`);
console.log(`  Already tagged:  ${skippedAlready}`);
console.log(`  Failed partial:  ${failedPartial}`);
console.log(`  Total modified:  ${taggedFull + taggedPartial}`);


// Results for hi.txt:

// 1,969 total verses with red letter tags
// 1,405 fully-red verses (from the earlier addRedLetters.js via KJV)
// 20 additional fully-red verses discovered from IRV USFM
// 544 partially-red verses — narrative + Jesus's words accurately split
// 77 partial verses that couldn't be reliably matched (left untagged to avoid errors)
~repos /only-bible-app

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.

scripts/addRedLettersHindi.js

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs