import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
// ES modules have no built-in __dirname, so derive it from this module's URL.
const thisModuleFile = fileURLToPath(import.meta.url);
const __dirname = path.dirname(thisModuleFile);

// Input/output verse file and the folder holding the USFM sources,
// both resolved relative to this script rather than the working directory.
const hiPath = path.join(__dirname, "files", "hi.txt");
const usfmDir = path.join(__dirname, "hin2017_usfm");
// USFM source file -> hi.txt book index (0-based).
// Only the books listed here are scanned for words-of-Jesus markup; a file
// that is missing on disk is skipped by the main loop.
// NOTE(review): the numeric prefix of "77-2COhin2017.usfm" does not follow the
// sequence of its neighbours (52-1CO ... 72-REV) - confirm it matches the file
// name on disk before relying on 2 Corinthians being processed.
const BOOKS = [
  { usfmFile: "46-MAThin2017.usfm", bookIdx: 39 },
  { usfmFile: "47-MRKhin2017.usfm", bookIdx: 40 },
  { usfmFile: "48-LUKhin2017.usfm", bookIdx: 41 },
  { usfmFile: "49-JHNhin2017.usfm", bookIdx: 42 },
  { usfmFile: "50-ACThin2017.usfm", bookIdx: 43 },
  { usfmFile: "52-1COhin2017.usfm", bookIdx: 45 },
  { usfmFile: "77-2COhin2017.usfm", bookIdx: 46 },
  { usfmFile: "72-REVhin2017.usfm", bookIdx: 65 },
];
/**
 * Parse a USFM file and extract red letter (words of Jesus) data per verse.
 *
 * Only verses that contain at least one non-empty red segment are returned.
 * A non-verse line (e.g. a poetry line) that carries \wj markup is treated as
 * a continuation of the most recent \v line of the current chapter, even when
 * that verse had no red text of its own so far.
 *
 * @param {string} filePath - Path of the USFM file (read synchronously as UTF-8).
 * @returns {Map<string, {segments: Array<{text: string, red: boolean}>, fullyRed: boolean}>}
 *   Map keyed by "chapter:verse", using the numbers as written in the USFM.
 */
function parseUSFM(filePath) {
  const raw = fs.readFileSync(filePath, "utf-8");
  // Accept both LF and CRLF files so no stray "\r" reaches the segment text.
  const lines = raw.split(/\r?\n/);
  const verseData = new Map();

  const hasText = (s) => s.text.trim() !== "";
  const hasRedText = (segs) => segs.some((s) => s.red && hasText(s));
  const isFullyRed = (segs) => segs.every((s) => s.red || !hasText(s));

  let currentChapter = 0;
  // Verse currently being assembled: { key, segments }. Kept even when the
  // verse has no red text yet, so a later red continuation line lands on the
  // right verse instead of on whichever verse was stored last.
  let current = null;

  // Publish the verse being assembled once it contains any red text.
  const storeCurrent = () => {
    if (hasRedText(current.segments)) {
      verseData.set(current.key, {
        segments: current.segments,
        fullyRed: isFullyRed(current.segments),
      });
    }
  };

  for (const line of lines) {
    const chapMatch = line.match(/^\\c\s+(\d+)/);
    if (chapMatch) {
      currentChapter = Number.parseInt(chapMatch[1], 10);
      current = null;
      continue;
    }

    const verseMatch = line.match(/^\\v\s+(\d+)\s+(.*)/);
    if (!verseMatch) {
      // Could be a continuation of a verse (poetry lines \q1, \q2, etc.).
      // Only lines that carry \wj (or nested \+wj) content are appended.
      if (current !== null && /\\\+?wj/.test(line)) {
        current.segments.push(...extractRedSegments(line));
        storeCurrent();
      }
      continue;
    }

    const verseNum = Number.parseInt(verseMatch[1], 10);
    const key = `${currentChapter}:${verseNum}`;
    if (current === null || current.key !== key) {
      // A repeated verse number (shouldn't happen) extends the stored entry.
      const existing = verseData.get(key);
      current = { key, segments: existing ? existing.segments : [] };
    }
    current.segments.push(...extractRedSegments(verseMatch[2]));
    storeCurrent();
  }

  return verseData;
}
/**
 * Extract red/non-red text segments from a USFM line.
 *
 * Handles \wj...\wj* and nested \+wj...\+wj*, and strips other USFM markers
 * from each segment via cleanUSFM(). Footnotes (\f ... \f*), cross-references
 * (\x ... \x*) and \bdit ... \bdit* spans are removed together with their
 * content before scanning. Segments that are empty after cleaning are dropped,
 * so two red spans separated only by whitespace come out as adjacent red
 * segments. An opener without a closer makes the rest of the line red.
 *
 * @param {string} text - One USFM line, or the content that follows a \v marker.
 * @returns {Array<{text: string, red: boolean}>} Segments in reading order.
 */
function extractRedSegments(text) {
  const stripped = text
    // First, strip footnotes \f ... \f*
    .replace(/\\f\s+.*?\\f\*/g, "")
    // Strip cross-references \x ... \x*
    .replace(/\\x\s+.*?\\x\*/g, "")
    // Strip \bdit ... \bdit* spans (content included)
    .replace(/\\bdit\s+.*?\\bdit\*/g, "");

  const segments = [];
  const pushChunk = (chunk, red) => {
    const cleaned = cleanUSFM(chunk);
    if (cleaned) segments.push({ text: cleaned, red });
  };

  // Matches an opener (\wj or \+wj, plus the single whitespace that follows it)
  // or a closer (\wj* or \+wj*). Group 1 is "*" only for a closer. The
  // lookahead keeps longer marker names that merely start with "wj" out.
  const wjMarker = /\\\+?wj(?:(\*)|(?![a-z0-9])\s?)/g;

  let pos = 0;
  let inRed = false;
  for (const m of stripped.matchAll(wjMarker)) {
    // Everything since the previous marker has the colour that was active then.
    pushChunk(stripped.slice(pos, m.index), inRed);
    pos = m.index + m[0].length;
    inRed = m[1] !== "*";
  }
  pushChunk(stripped.slice(pos), inRed);

  return segments;
}
/**
 * Remove USFM formatting markers from text, keeping only the readable text.
 *
 * @param {string} text - Text that may contain USFM markers.
 * @returns {string} The text without markers or pilcrows, with runs of
 *   whitespace collapsed to one space and both ends trimmed.
 */
function cleanUSFM(text) {
  return (
    text
      // Remove closing markers like \it*, \wj*, \+wj*
      .replace(/\\\+?[a-z]+\d*\*/g, "")
      // Remove opening markers like \it, \q1, \+wj and one following whitespace
      .replace(/\\\+?[a-z]+\d*\s?/g, "")
      // Remove pilcrow and extra whitespace
      .replace(/¶/g, "")
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Normalize Hindi text for fuzzy comparison.
 * Handles common differences between IRV and traditional Hindi Bible.
 *
 * Every step either deletes a character, replaces it one-for-one, or collapses
 * whitespace, so findOriginalPos() can map an index in the result back to the
 * input. The punctuation class below must stay identical to the one used
 * there.
 * NOTE(review): the class repeats the ASCII quote characters; it looks as if
 * typographic quotes were intended - confirm before extending both places.
 *
 * @param {string} text - Verse text or a fragment of it.
 * @returns {string} The comparison form of the text.
 */
function normalize(text) {
  return (
    text
      // Remove quotes, punctuation
      .replace(/[""''"\(\)।;:,!?¶]/g, "")
      // Normalize chandrabindu/anusvara (chandrabindu -> anusvara, one-for-one)
      .replace(/\u0901/g, "\u0902")
      // Collapse whitespace left behind by the removals
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Find the best split point in hi.txt verse text to match IRV red/non-red boundary.
 *
 * Consecutive segments of the same colour are merged first (the input array
 * and its objects are not modified). The merged pattern decides the result:
 *   - all red                     -> { type: "full" }
 *   - narrative + red             -> { type: "prefix", splitIdx }
 *   - red + narrative             -> { type: "suffix", splitIdx }
 *   - narrative + red + narrative -> { type: "both", prefixIdx, suffixIdx }
 * Indices are offsets into hiText as returned by findSplitPoint().
 *
 * @param {string} hiText - Verse text from hi.txt.
 * @param {Array<{text: string, red: boolean}>} segments - IRV segments of the verse.
 * @returns {{type: string, splitIdx?: number, prefixIdx?: number, suffixIdx?: number} | null}
 *   null when there is nothing red, the pattern is not one of the above, or no
 *   usable split point was found.
 */
function findRedBoundaries(hiText, segments) {
  // Combine consecutive same-type segments
  const combined = [];
  for (const seg of segments) {
    const last = combined[combined.length - 1];
    if (last !== undefined && last.red === seg.red) {
      last.text += ` ${seg.text}`;
    } else {
      combined.push({ ...seg });
    }
  }

  // Nothing red at all (including an empty segment list): nothing to tag.
  if (!combined.some((s) => s.red)) return null;
  if (combined.every((s) => s.red)) return { type: "full" };

  // Determine pattern type
  const first = combined[0];
  const last = combined[combined.length - 1];
  const hasLeadingNonRed = !first.red && first.text.trim() !== "";
  const hasTrailingNonRed = !last.red && last.text.trim() !== "";

  // For partial verses, locate the red boundaries in the hi.txt text using the
  // IRV narrative (non-red) text on either side of Jesus's words.
  const irvNonRedPrefix = hasLeadingNonRed ? first.text : "";
  const irvNonRedSuffix = hasTrailingNonRed ? last.text : "";

  if (hasLeadingNonRed && !hasTrailingNonRed) {
    // Pattern: narrative + red (Jesus speaks to end of verse)
    const splitIdx = findSplitPoint(hiText, irvNonRedPrefix, "prefix");
    if (splitIdx !== -1) return { type: "prefix", splitIdx };
  } else if (!hasLeadingNonRed && hasTrailingNonRed) {
    // Pattern: red + narrative (Jesus finishes, narrator continues)
    const splitIdx = findSplitPoint(hiText, irvNonRedSuffix, "suffix");
    if (splitIdx !== -1) return { type: "suffix", splitIdx };
  } else if (hasLeadingNonRed && hasTrailingNonRed) {
    // Pattern: narrative + red + narrative
    const prefixIdx = findSplitPoint(hiText, irvNonRedPrefix, "prefix");
    const suffixIdx = findSplitPoint(hiText, irvNonRedSuffix, "suffix");
    if (prefixIdx !== -1 && suffixIdx !== -1 && prefixIdx < suffixIdx) {
      return { type: "both", prefixIdx, suffixIdx };
    }
  }

  return null;
}
/**
 * Find the split point in hi.txt text where red/non-red boundary should be.
 *
 * For position "prefix" the result is the index where Jesus's words start;
 * for any other position ("suffix") it is the index where they stop.
 * Two strategies are tried in order:
 *   1. Match the 5..2 words of the IRV narrative that touch the boundary
 *      against the normalized hi.txt text and map the hit back with
 *      findOriginalPos().
 *   2. Fall back to fixed Hindi marker phrases.
 *
 * @param {string} hiText - Verse text from hi.txt.
 * @param {string} irvNonRedText - IRV narrative text before (prefix) or after (suffix) the red text.
 * @param {string} position - "prefix" or "suffix".
 * @returns {number} Offset into hiText, or -1 when no boundary was found.
 */
function findSplitPoint(hiText, irvNonRedText, position) {
  const hiNorm = normalize(hiText);
  const irvWords = normalize(irvNonRedText).split(" ").filter(Boolean);
  const maxWords = Math.min(5, irvWords.length);

  if (position === "prefix") {
    // Strategy 1: match the last N words of the IRV prefix, longest run first.
    for (let n = maxWords; n >= 2; n--) {
      const lastWords = irvWords.slice(-n).join(" ");
      if (lastWords.length <= 3) continue;
      const matchIdx = hiNorm.indexOf(lastWords);
      if (matchIdx === -1) continue;
      let pos = findOriginalPos(hiText, hiNorm, matchIdx + lastWords.length);
      // Skip whitespace and punctuation after match
      while (pos < hiText.length && /[\s,;:]/.test(hiText[pos])) pos++;
      return pos;
    }

    // Strategy 2: Find the LAST speech marker in hi.txt (handles multi-speaker verses)
    // Common Hindi speech markers that indicate where Jesus starts speaking
    const speechMarkers = [
      "उत्तर दिया,", "उत्तर दिया;", "उत्तर दिया:",
      "कहा,", "कहा;", "कहा:", "कहा।",
      "बोला,", "बोला;", "बोला:",
      "कहने लगा,", "कहने लगा;",
      "पुकारकर कहा,", "पुकारकर कहा;",
      "डांटकर कहा,", "डांटकर कहा;",
    ];
    let bestIdx = -1;
    for (const marker of speechMarkers) {
      const lastFound = hiText.lastIndexOf(marker);
      if (lastFound === -1) continue;
      let actualSplit = lastFound + marker.length;
      while (actualSplit < hiText.length && hiText[actualSplit] === " ") actualSplit++;
      // Keep the split that lies furthest into the verse.
      if (actualSplit > bestIdx) bestIdx = actualSplit;
    }
    return bestIdx;
  }

  // suffix: find where Jesus stops speaking
  // Strategy 1: match the first N words of the IRV suffix, longest run first.
  for (let n = maxWords; n >= 2; n--) {
    const firstWords = irvWords.slice(0, n).join(" ");
    if (firstWords.length <= 3) continue;
    const matchIdx = hiNorm.indexOf(firstWords);
    if (matchIdx === -1) continue;
    let pos = findOriginalPos(hiText, hiNorm, matchIdx);
    // Back up past any whitespace
    while (pos > 0 && /\s/.test(hiText[pos - 1])) pos--;
    return pos;
  }

  // Strategy 2: look for common narrative continuation patterns
  const suffixMarkers = [
    " और उसका", " और वह", " तब उसने", " तब उस ने",
    " और उस ने", " फिर उस ने", " सो उस ने",
  ];
  for (const marker of suffixMarkers) {
    const idx = hiText.lastIndexOf(marker);
    // Ignore hits in the first 30% of the verse: too early to be the end of the speech.
    if (idx !== -1 && idx > hiText.length * 0.3) {
      return idx;
    }
  }
  return -1;
}
/**
 * Map position from normalized text back to original text position.
 *
 * Walks the original text and counts only the characters that survive
 * normalization, until normPos of them have been consumed.
 *
 * @param {string} original - Text before normalization.
 * @param {string} normalized - The normalized form of `original`.
 * @param {number} normPos - Offset into `normalized`.
 * @returns {number} Offset into `original` reached when `normPos` normalized
 *   characters have been consumed (at most `original.length`). The offset may
 *   point at punctuation or whitespace that precedes the target character.
 */
function findOriginalPos(original, normalized, normPos) {
  // Must stay identical to the punctuation class removed during normalization.
  const removed = /[""''"\(\)।;:,!?¶]/;
  const space = /\s/;

  let origIdx = 0;
  let normIdx = 0;
  while (normIdx < normPos && origIdx < original.length) {
    const origChar = original[origIdx];
    if (removed.test(origChar)) {
      // Skip chars that were removed during normalization
      origIdx++;
    } else if (space.test(origChar)) {
      // Spaces: advance both if normalized also has space; otherwise this
      // space was collapsed or trimmed away and only the original advances.
      if (normIdx < normalized.length && space.test(normalized[normIdx])) {
        normIdx++;
      }
      origIdx++;
    } else {
      origIdx++;
      normIdx++;
    }
  }
  return origIdx;
}
/**
 * Apply red tags to a single hi.txt line based on boundary information.
 *
 * The line is pipe-separated; fields 1-6 are kept verbatim and everything
 * from field 7 on is the verse text. A leading pilcrow/whitespace run always
 * stays outside the <red> tag, and anything after the first line terminator
 * in the text (e.g. a trailing "\r") is re-appended unchanged.
 *
 * @param {string} line - One hi.txt line.
 * @param {{type: string, splitIdx?: number, prefixIdx?: number, suffixIdx?: number} | null} boundaries
 *   Boundary description; indices are offsets into the verse text.
 * @returns {string} The tagged line, or `line` unchanged when it has fewer
 *   than 7 fields, is already tagged, has no content, or the boundaries are
 *   missing, of unknown type, or out of range.
 */
function applyRedTags(line, boundaries) {
  if (!boundaries) return line;

  const parts = line.split("|");
  if (parts.length < 7) return line;
  const prefix = parts.slice(0, 6).join("|");
  const text = parts.slice(6).join("|");
  if (text.includes("<red>")) return line;

  // Preserve leading ¶ and whitespace
  const match = text.match(/^([¶\s]*)(.*)/);
  const leading = match[1] || "";
  const content = match[2] || "";
  if (!content.trim()) return line;

  // Split indices are relative to the text content (including leading).
  const fullText = leading + content;
  // "." stops at a line terminator; keep whatever follows so it is not lost.
  const tail = text.slice(fullText.length);
  const wrap = (from, to) =>
    `${prefix}|${fullText.substring(0, from)}<red>${fullText.substring(from, to)}</red>${fullText.substring(to)}${tail}`;

  switch (boundaries.type) {
    case "full":
      return wrap(leading.length, fullText.length);

    case "prefix": {
      const { splitIdx } = boundaries;
      // Never open the tag inside the leading ¶/whitespace run.
      const start = Math.max(splitIdx, leading.length);
      if (splitIdx >= 0 && start < fullText.length) {
        return wrap(start, fullText.length);
      }
      return line;
    }

    case "suffix": {
      const { splitIdx } = boundaries;
      // Require at least one character of red content after the leading run.
      if (splitIdx > leading.length && splitIdx <= fullText.length) {
        return wrap(leading.length, splitIdx);
      }
      return line;
    }

    case "both": {
      const { prefixIdx, suffixIdx } = boundaries;
      const start = Math.max(prefixIdx, leading.length);
      if (prefixIdx >= 0 && suffixIdx > start && suffixIdx <= fullText.length) {
        return wrap(start, suffixIdx);
      }
      return line;
    }

    default:
      return line;
  }
}
// ============= MAIN =============

// Step 1: Parse all USFM files to get red letter data
console.log("Parsing USFM files for red letter data...");
const allRedData = new Map(); // "bookIdx:chapter:verse" -> {segments, fullyRed}

for (const book of BOOKS) {
  const usfmPath = path.join(usfmDir, book.usfmFile);
  if (!fs.existsSync(usfmPath)) {
    console.log(` Skipping ${book.usfmFile} (not found)`);
    continue;
  }

  const verseData = parseUSFM(usfmPath);
  let fullCount = 0;
  let partialCount = 0;
  for (const [chapVerse, data] of verseData) {
    const [chap, verse] = chapVerse.split(":");
    // USFM chapters are 1-based, hi.txt uses 0-based
    const key = `${book.bookIdx}:${Number.parseInt(chap, 10) - 1}:${Number.parseInt(verse, 10) - 1}`;
    allRedData.set(key, data);
    if (data.fullyRed) fullCount++;
    else partialCount++;
  }
  console.log(` ${book.usfmFile}: ${fullCount} full, ${partialCount} partial`);
}
console.log(`Total USFM red data: ${allRedData.size} verses`);

// Step 2: Load hi.txt and apply red tags
let taggedFull = 0;
let taggedPartial = 0;
let skippedAlready = 0;
let failedPartial = 0;

// Wrap the whole verse text (field 7 onward) in <red>, keeping a leading
// pilcrow/whitespace run outside the tag.
const tagWholeVerse = (parts) => {
  const text = parts.slice(6).join("|");
  const match = text.match(/^([¶\s]*)(.*)/);
  const leading = match[1] || "";
  const content = match[2] || "";
  const prefix = parts.slice(0, 6).join("|");
  return `${prefix}|${leading}<red>${content}</red>`;
};

const hiLines = fs.readFileSync(hiPath, "utf-8").split("\n");
const newLines = [];
for (const line of hiLines) {
  const parts = line.split("|");
  if (parts.length < 7) {
    newLines.push(line);
    continue;
  }

  // Fields 2-4 hold the 0-based book, chapter and verse indices.
  const bookIdx = parts[1];
  const chapIdx = parts[2];
  const verseIdx = parts[3];
  const key = `${bookIdx}:${chapIdx}:${verseIdx}`;
  const redData = allRedData.get(key);
  if (!redData) {
    newLines.push(line);
    continue;
  }

  const text = parts.slice(6).join("|");
  if (text.includes("<red>")) {
    skippedAlready++;
    newLines.push(line);
    continue;
  }

  if (redData.fullyRed) {
    taggedFull++;
    newLines.push(tagWholeVerse(parts));
    continue;
  }

  // Partially red verse - try to find boundaries
  // Use the text portion only (after the 6th pipe)
  const fullText = text;
  const boundaries = findRedBoundaries(fullText, redData.segments);
  if (!boundaries) {
    // No reliable match: leave the verse untagged rather than guess.
    failedPartial++;
    newLines.push(line);
    continue;
  }

  if (boundaries.type === "full") {
    taggedFull++;
    newLines.push(tagWholeVerse(parts));
    continue;
  }

  const result = applyRedTags(line, boundaries);
  if (result === line) {
    failedPartial++;
  } else {
    taggedPartial++;
  }
  newLines.push(result);
}

// hi.txt is rewritten in place.
fs.writeFileSync(hiPath, newLines.join("\n"), "utf-8");

console.log(`\nResults:`);
console.log(` Tagged full: ${taggedFull}`);
console.log(` Tagged partial: ${taggedPartial}`);
console.log(` Already tagged: ${skippedAlready}`);
console.log(` Failed partial: ${failedPartial}`);
console.log(` Total modified: ${taggedFull + taggedPartial}`);
// 1,969 total verses with red letter tags
// 1,405 fully-red verses (from the earlier addRedLetters.js via KJV)
// 20 additional fully-red verses discovered from IRV USFM
// 544 partially-red verses — narrative + Jesus's words accurately split
// 77 partial verses that couldn't be reliably matched (left untagged to avoid errors)