import { fileURLToPath } from "url";
// ES modules do not define __dirname, so derive it from this module's URL.
const scriptFile = fileURLToPath(import.meta.url);
const __dirname = path.dirname(scriptFile);

// Nepali verse file; the main section below reads it and writes it back in place.
const nePath = path.join(__dirname, "files", "ne.txt");

// The Hindi IRV USFM files are reused as the source of red letter verse IDs/structure.
const usfmDir = path.join(__dirname, "hin2017_usfm");
// USFM file name -> ne.txt book index (0-based).
// Only the books listed here are scanned for red-letter markup.
//
// In this table the numeric file prefix is always bookIdx + 7 (46-MAT -> 39,
// 52-1CO -> 45, 72-REV -> 65). The 2 Corinthians entry previously read
// "77-2COhin2017.usfm", which breaks that sequence and lies beyond the REV
// prefix; because missing files are skipped with only a log line, the book was
// silently left out.
// NOTE(review): confirm "53-2COhin2017.usfm" against the actual contents of
// the hin2017_usfm directory.
const BOOKS = [
  { usfmFile: "46-MAThin2017.usfm", bookIdx: 39 },
  { usfmFile: "47-MRKhin2017.usfm", bookIdx: 40 },
  { usfmFile: "48-LUKhin2017.usfm", bookIdx: 41 },
  { usfmFile: "49-JHNhin2017.usfm", bookIdx: 42 },
  { usfmFile: "50-ACThin2017.usfm", bookIdx: 43 },
  { usfmFile: "52-1COhin2017.usfm", bookIdx: 45 },
  { usfmFile: "53-2COhin2017.usfm", bookIdx: 46 },
  { usfmFile: "72-REVhin2017.usfm", bookIdx: 65 },
];
// ===== USFM Parsing (reused from addRedLettersHindi.js) =====
// Reads one USFM file and collects red-letter data per verse in a Map.
//   filePath - path of the USFM file; read synchronously as UTF-8.
// Map keys are "chapter:verse" strings built from the numbers in the \c and \v
// markers; values are { segments, fullyRed }, where `segments` is the output of
// extractRedSegments and `fullyRed` is true when every segment is red or blank.
// NOTE(review): in this view several statements appear without the guards,
// declaration, `else` and `return` they need (currentChapter is never declared,
// chapMatch/verseMatch are dereferenced unconditionally, nothing is returned).
// Presumably those lines exist in the complete file; the caller iterates the
// result as [key, data] pairs, so the Map is presumably what is returned. Confirm.
function parseUSFM(filePath) {
const raw = fs.readFileSync(filePath, "utf-8");
const lines = raw.split("\n");
const verseData = new Map();
for (const line of lines) {
// Chapter marker: "\c <number>" at the start of the line.
const chapMatch = line.match(/^\\c\s+(\d+)/);
currentChapter = parseInt(chapMatch[1]);
// Verse marker: "\v <number> <rest of the line>".
const verseMatch = line.match(/^\\v\s+(\d+)\s+(.*)/);
// Line carrying \wj markup while at least one verse is already stored: its
// segments are appended to the last-inserted key of the current chapter.
// Presumably this runs only for lines that are not verse lines (condition not visible).
if (line.match(/\\wj/) && verseData.size > 0) {
// Map keys iterate in insertion order, so the last filtered key is the newest one.
const keys = [...verseData.keys()].filter((k) => k.startsWith(`${currentChapter}:`));
// NOTE(review): lastKey is undefined when no verse of this chapter is stored yet;
// no guard for that is visible here.
const lastKey = keys[keys.length - 1];
const entry = verseData.get(lastKey);
const contSegments = extractRedSegments(line);
entry.segments.push(...contSegments);
// Recomputed over all segments collected so far for that verse.
entry.fullyRed = entry.segments.every((s) => s.red || !s.text.trim());
const verseNum = parseInt(verseMatch[1]);
const verseContent = verseMatch[2];
const segments = extractRedSegments(verseContent);
// hasRed: at least one red segment with visible text. Its use is not visible here;
// presumably verses without red text are skipped.
const hasRed = segments.some((s) => s.red && s.text.trim());
const fullyRed = segments.every((s) => s.red || !s.text.trim());
const key = `${currentChapter}:${verseNum}`;
// Same chapter:verse seen again: merge the segments into the existing entry.
if (verseData.has(key)) {
const entry = verseData.get(key);
entry.segments.push(...segments);
entry.fullyRed = entry.fullyRed && fullyRed;
// First occurrence of this key (presumably the `else` branch).
verseData.set(key, { segments, fullyRed });
// Splits one line of USFM text into an ordered list of { text, red } segments,
// where `red` marks text found between a \wj opener and a \wj* or \+wj* closer.
// Segment text is passed through cleanUSFM; empty results are not pushed.
// NOTE(review): the declarations of `segments`, `pos` and `actualEnd`, the
// branch structure inside the loop (e.g. the "no further \wj" case) and the
// return statement are not visible in this view — confirm against the full file.
function extractRedSegments(text) {
// Drop \f ... \f*, \x ... \x* and \bdit ... \bdit* spans together with their
// content (footnotes and cross-references in USFM). Matching is non-greedy
// and `.` does not cross line breaks.
text = text.replace(/\\f\s+.*?\\f\*/g, "");
text = text.replace(/\\x\s+.*?\\x\*/g, "");
text = text.replace(/\\bdit\s+.*?\\bdit\*/g, "");
while (pos < text.length) {
const wjIdx = text.indexOf("\\wj", pos);
// Rest of the line as a non-red segment (presumably when wjIdx is -1).
const remaining = cleanUSFM(text.substring(pos));
if (remaining) segments.push({ text: remaining, red: false });
// Text between the current position and the opener is non-red.
const before = cleanUSFM(text.substring(pos, wjIdx));
if (before) segments.push({ text: before, red: false });
// Step past the opener: to just after the next space when that space lies
// within 5 characters of the marker start, otherwise by 4 characters.
const afterMarker = text.indexOf(" ", wjIdx);
pos = afterMarker !== -1 && afterMarker - wjIdx < 6 ? afterMarker + 1 : wjIdx + 4;
// Two closing forms are looked for: "\wj*" and "\+wj*".
const endIdx = text.indexOf("\\wj*", pos);
const endIdx2 = text.indexOf("\\+wj*", pos);
// No closer on this line: everything that remains is treated as red.
if (endIdx === -1 && endIdx2 === -1) {
const remaining = cleanUSFM(text.substring(pos));
if (remaining) segments.push({ text: remaining, red: true });
// Otherwise use whichever closer comes first.
if (endIdx === -1) actualEnd = endIdx2;
else if (endIdx2 === -1) actualEnd = endIdx;
else actualEnd = Math.min(endIdx, endIdx2);
const redText = cleanUSFM(text.substring(pos, actualEnd));
if (redText) segments.push({ text: redText, red: true });
// "\+wj*" is 5 characters long, "\wj*" is 4.
const isPlus = text[actualEnd + 1] === "+";
pos = actualEnd + (isPlus ? 5 : 4);
// Look ahead: when only markup/whitespace separates this closer from the next
// opener, jump straight into the next red span so no non-red segment is
// emitted for the gap.
const nextWj = text.indexOf("\\wj", pos);
const nextText = text.substring(pos, nextWj !== -1 ? nextWj : text.length);
const cleanNext = cleanUSFM(nextText).trim();
if (nextWj !== -1 && !cleanNext) {
const afterMarker2 = text.indexOf(" ", nextWj);
pos = afterMarker2 !== -1 && afterMarker2 - nextWj < 6 ? afterMarker2 + 1 : nextWj + 4;
// Removes USFM markers from a piece of text.
// NOTE(review): the start of the expression (presumably `return text`) and
// anything after the second replace are not visible in this view — confirm
// whether the result is also trimmed or whitespace-normalised.
function cleanUSFM(text) {
// Closing markers: backslash, lowercase letters and/or "+", then "*" (e.g. \wj*, \+wj*).
.replace(/\\[a-z+]+\*/g, "")
// Opening markers: backslash, lowercase letters, optional digit, optional single
// whitespace character. "+" is not in this class, so nested openers such as
// "\+wj " are not matched by this pattern.
.replace(/\\[a-z]+\d?\s?/g, "")
// ===== Nepali-specific split logic =====
// Nepali speech markers that indicate where Jesus starts or stops speaking,
// ordered roughly by specificity/frequency.
// "called out/rebuked" etc.
// Curly quote start after speech marker (fallback)
// Find the best split point in the Nepali verse text where the red/non-red boundary should be.
// Uses the USFM segment structure to determine the pattern (prefix/red, red/suffix, or both).
// Decides how the red portion is laid out within a verse and where to split it.
//   neText   - the Nepali verse text in which split indices are searched.
//   segments - { text, red } segments taken from the USFM for the same verse.
// Visible results are { type: "prefix", splitIdx }, { type: "suffix", splitIdx }
// and { type: "both", prefixIdx, suffixIdx }; the caller also checks for
// type "full". A split index may be -1 in the prefix/suffix results, because
// findSplitPoint's result is passed through without a check here.
// NOTE(review): the declaration of `combined`, the `else` that pushes a new
// segment, the body of the all-red case (presumably returning type "full")
// and the fallback return are not visible in this view — confirm.
function findRedBoundaries(neText, segments) {
// Combine consecutive same-type segments
for (const seg of segments) {
if (combined.length > 0 && combined[combined.length - 1].red === seg.red) {
combined[combined.length - 1].text += " " + seg.text;
// Different type than the previous run: start a new run with a shallow copy
// so the caller's segment objects are not mutated by the += above.
combined.push({ ...seg });
// Non-red text with visible characters at the very start / very end of the verse.
const hasLeadingNonRed = combined.length > 0 && !combined[0].red && combined[0].text.trim();
const hasTrailingNonRed =
combined.length > 0 && !combined[combined.length - 1].red && combined[combined.length - 1].text.trim();
// Every combined run is red.
if (combined.every((s) => s.red)) {
if (hasLeadingNonRed && !hasTrailingNonRed) {
// Pattern: narrative + red (Jesus speaks to end of verse)
const splitIdx = findSplitPoint(neText, "prefix");
return { type: "prefix", splitIdx };
} else if (!hasLeadingNonRed && hasTrailingNonRed) {
// Pattern: red + narrative (Jesus finishes, narrator continues)
const splitIdx = findSplitPoint(neText, "suffix");
return { type: "suffix", splitIdx };
} else if (hasLeadingNonRed && hasTrailingNonRed) {
// Pattern: narrative + red + narrative
const prefixIdx = findSplitPoint(neText, "prefix");
const suffixIdx = findSplitPoint(neText, "suffix");
// Only accepted when both points were found and are in the right order.
if (prefixIdx !== -1 && suffixIdx !== -1 && prefixIdx < suffixIdx) {
return { type: "both", prefixIdx, suffixIdx };
// Find the split point in the Nepali text.
// For "prefix": find where Jesus's speech starts (after the speech marker and opening quote).
// For "suffix": find where Jesus's speech ends (at the closing quote before the narrator resumes).
// Returns a character index into neText at which the red span should start
// (position "prefix") or end (any other position value, i.e. "suffix").
// Callers compare the result against -1, so -1 is presumably the "not found" value.
// NOTE(review): SPEECH_MARKERS, the loop-state variables (bestIdx, lastFound,
// searchFrom, m, lastMatch), the inner search loops, the increments that skip
// a quote, and the final returns are not visible in this view — confirm.
function findSplitPoint(neText, position) {
if (position === "prefix") {
// Strategy 1: Find the LAST speech marker in the verse
// (handles multi-speaker verses where Jesus speaks last)
for (const marker of SPEECH_MARKERS) {
const idx = neText.indexOf(marker, searchFrom);
// Candidate split: just past the last occurrence of this marker.
let splitAt = lastFound + marker.length;
// Skip whitespace after marker
while (splitAt < neText.length && neText[splitAt] === " ") splitAt++;
// Skip opening quote mark if present
if (splitAt < neText.length && (neText[splitAt] === "\u201C" || neText[splitAt] === "'")) {
// Keep the candidate that lies furthest into the verse across all markers.
if (bestIdx === -1 || splitAt > bestIdx) {
if (bestIdx !== -1) return bestIdx;
// Strategy 2: Find the last opening curly quote preceded by comma/space
// This catches patterns like: ...येशूले तिनीहरूलाई भने, "...
// The pattern requires one of , ; : then optional whitespace before the quote.
const quotePattern = /[,;:]\s*\u201C/g;
while ((m = quotePattern.exec(neText)) !== null) {
// Split immediately after the matched opening quote.
let splitAt = lastMatch.index + lastMatch[0].length;
// suffix: find where Jesus stops speaking
// Strategy 1: Find the LAST closing curly quote followed by narrative text
// Look for closing quote \u201D followed by non-quote text
const lastCloseQuote = neText.lastIndexOf("\u201D");
// Requires at least two characters after the quote.
if (lastCloseQuote !== -1 && lastCloseQuote < neText.length - 2) {
// There's text after the closing quote - that's the narrator's part
return lastCloseQuote + 1;
// Strategy 2: Look for closing single quote
// NOTE(review): this searches for the ASCII apostrophe, which would also match
// an apostrophe used inside a word — confirm that is intended.
const lastSingleClose = neText.lastIndexOf("'");
if (lastSingleClose !== -1 && lastSingleClose < neText.length - 2) {
return lastSingleClose + 1;
/**
 * Apply red tags to a single ne.txt line based on boundary information.
 *
 * The line is pipe-delimited: the first six fields are kept verbatim and
 * everything from the seventh field on (re-joined with "|") is the verse text.
 * Split indices in `boundaries` are character offsets into that verse text.
 *
 * The line is returned unchanged when it has fewer than seven fields, is
 * already tagged, has no visible text, or when the boundaries are unusable
 * (unknown type, index out of range, or a red span that would be blank).
 *
 * @param {string} line - One line of ne.txt.
 * @param {{type: string, splitIdx?: number, prefixIdx?: number, suffixIdx?: number}} boundaries
 *   - "full":   wrap the whole text;
 *   - "prefix": narrative, then red from `splitIdx` to the end;
 *   - "suffix": red up to `splitIdx`, then narrative;
 *   - "both":   red between `prefixIdx` and `suffixIdx`.
 * @returns {string} The line with a single <red>…</red> span, or the original line.
 */
function applyRedTags(line, boundaries) {
  const parts = line.split("|");
  if (parts.length < 7) return line;

  const prefix = parts.slice(0, 6).join("|");
  const rawText = parts.slice(6).join("|");
  if (rawText.includes("<red>")) return line;

  // Lines come from splitting on "\n", so CRLF input leaves a trailing "\r".
  // Keep it outside the tag and put it back at the very end instead of losing it.
  const eol = rawText.endsWith("\r") ? "\r" : "";
  const text = eol ? rawText.slice(0, -1) : rawText;

  // Preserve leading ¶ and whitespace. [\s\S] instead of "." so that unusual
  // line terminators inside the text cannot truncate the content.
  const [, leading, content] = text.match(/^([¶\s]*)([\s\S]*)$/);
  if (!content.trim()) return line;

  const wrap = (before, red, after) => `${prefix}|${before}<red>${red}</red>${after}${eol}`;

  switch (boundaries.type) {
    case "full":
      return wrap(leading, content, "");

    case "prefix": {
      const { splitIdx } = boundaries;
      if (!(splitIdx >= 0 && splitIdx < text.length)) return line;
      // Never pull the leading ¶/whitespace inside the tag (matches "full"/"suffix").
      const start = Math.max(splitIdx, leading.length);
      const red = text.slice(start);
      if (!red.trim()) return line;
      return wrap(text.slice(0, start), red, "");
    }

    case "suffix": {
      const { splitIdx } = boundaries;
      // A split inside the leading run would produce an empty span and
      // duplicate leading characters, so it is rejected.
      if (!(splitIdx > leading.length && splitIdx <= text.length)) return line;
      return wrap(leading, text.slice(leading.length, splitIdx), text.slice(splitIdx));
    }

    case "both": {
      const { prefixIdx, suffixIdx } = boundaries;
      if (!(prefixIdx >= 0 && suffixIdx > prefixIdx && suffixIdx <= text.length)) return line;
      const start = Math.max(prefixIdx, leading.length);
      const red = text.slice(start, suffixIdx);
      if (!red.trim()) return line;
      return wrap(text.slice(0, start), red, text.slice(suffixIdx));
    }

    default:
      return line;
  }
}
// ============= MAIN =============
// Top-level script: builds a lookup of red-letter verses from the USFM files,
// then rewrites ne.txt in place with <red>…</red> tags.
// NOTE(review): the counters used below (fullCount, partialCount, taggedFull,
// taggedPartial, skippedAlready, failedPartial), several closing braces,
// `continue`/`return` statements and the fully-red check are not visible in
// this view — confirm against the complete file.
// Step 1: Parse Hindi IRV USFM files to get red letter structure
console.log("Parsing Hindi IRV USFM files for red letter verse IDs/structure...");
const allRedData = new Map(); // "bookIdx:chapter:verse" -> {segments, fullyRed}
for (const book of BOOKS) {
const usfmPath = path.join(usfmDir, book.usfmFile);
// A missing file is only logged; presumably the loop then continues with the next book.
if (!fs.existsSync(usfmPath)) {
console.log(` Skipping ${book.usfmFile} (not found)`);
const verseData = parseUSFM(usfmPath);
for (const [chapVerse, data] of verseData) {
const [chap, verse] = chapVerse.split(":");
// USFM chapters are 1-based, ne.txt uses 0-based
// (the verse number is shifted by one in the same way)
const key = `${book.bookIdx}:${parseInt(chap) - 1}:${parseInt(verse) - 1}`;
allRedData.set(key, data);
if (data.fullyRed) fullCount++;
console.log(` ${book.usfmFile}: ${fullCount} full, ${partialCount} partial`);
console.log(`Total USFM red data: ${allRedData.size} verses`);
// Step 2: Load ne.txt and apply red tags
const neLines = fs.readFileSync(nePath, "utf-8").split("\n");
const newLines = neLines.map((line) => {
// Lines with fewer than seven pipe-separated fields pass through untouched.
const parts = line.split("|");
if (parts.length < 7) return line;
// Fields 1-3 are compared as strings against the "book:chapter:verse" keys built above.
const bookIdx = parts[1];
const chapIdx = parts[2];
const verseIdx = parts[3];
const key = `${bookIdx}:${chapIdx}:${verseIdx}`;
const redData = allRedData.get(key);
// No red-letter data for this verse: keep the line as it is.
if (!redData) return line;
// Everything from the seventh field on is verse text (it may itself contain "|").
const text = parts.slice(6).join("|");
// Already tagged; presumably counted as skipped and returned unchanged (body not visible).
if (text.includes("<red>")) {
// Wrap the whole verse, keeping leading ¶/whitespace outside the tag
// (presumably the fully-red case; its condition is not visible).
const match = text.match(/^([¶\s]*)(.*)/);
const leading = match[1] || "";
const content = match[2] || "";
const prefix = parts.slice(0, 6).join("|");
return `${prefix}|${leading}<red>${content}</red>`;
// Partially red verse - try to find boundaries using Nepali speech markers
const boundaries = findRedBoundaries(text, redData.segments);
// The boundary search can still conclude that the whole verse is red.
if (boundaries.type === "full") {
const match = text.match(/^([¶\s]*)(.*)/);
const leading = match[1] || "";
const content = match[2] || "";
const prefix = parts.slice(0, 6).join("|");
return `${prefix}|${leading}<red>${content}</red>`;
// Otherwise delegate to applyRedTags, which returns the line unchanged when
// it cannot apply the boundaries.
const result = applyRedTags(line, boundaries);
// Overwrites the input file in place.
fs.writeFileSync(nePath, newLines.join("\n"), "utf-8");
console.log(`\nResults:`);
console.log(` Tagged full: ${taggedFull}`);
console.log(` Tagged partial: ${taggedPartial}`);
console.log(` Already tagged: ${skippedAlready}`);
console.log(` Failed partial: ${failedPartial}`);
console.log(` Total modified: ${taggedFull + taggedPartial}`);