~repos /only-bible-app

#kotlin#android#ios

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.



scripts/addRedLettersHindi.js



import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
// Directory containing this script (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Verse file to process: <script dir>/files/hi.txt.
const hiPath = path.join(__dirname, "files", "hi.txt");
// Directory holding the USFM source files: <script dir>/hin2017_usfm.
const usfmDir = path.join(__dirname, "hin2017_usfm");
// USFM file name -> hi.txt book index (0-based).
// Every file's numeric prefix equals bookIdx + 7 (46-MAT -> 39 ... 72-REV -> 65),
// so 2 Corinthians (bookIdx 46) is file 53, directly after 52-1CO.
const BOOKS = [
  { usfmFile: "46-MAThin2017.usfm", bookIdx: 39 },
  { usfmFile: "47-MRKhin2017.usfm", bookIdx: 40 },
  { usfmFile: "48-LUKhin2017.usfm", bookIdx: 41 },
  { usfmFile: "49-JHNhin2017.usfm", bookIdx: 42 },
  { usfmFile: "50-ACThin2017.usfm", bookIdx: 43 },
  { usfmFile: "52-1COhin2017.usfm", bookIdx: 45 },
  { usfmFile: "53-2COhin2017.usfm", bookIdx: 46 },
  { usfmFile: "72-REVhin2017.usfm", bookIdx: 65 },
];
/**
 * Parse a USFM file and extract red letter (words of Jesus) data per verse.
 *
 * Only verses that contain at least one non-empty red segment are returned.
 * A non-verse line (poetry \q1, \q2, etc.) that contains a \wj marker is
 * treated as a continuation of the verse whose \v line most recently preceded
 * it in the same chapter, whether or not that \v line itself had red text.
 *
 * @param {string} filePath - Path of the USFM file to read (UTF-8).
 * @returns {Map<string, {segments: Array<{text: string, red: boolean}>, fullyRed: boolean}>}
 *   Map keyed by "chapter:verse" (numbers exactly as written in the file).
 */
function parseUSFM(filePath) {
  const raw = fs.readFileSync(filePath, "utf-8");
  const verseData = new Map();
  const hasRedText = (segs) => segs.some((s) => s.red && s.text.trim());
  const isFullyRed = (segs) => segs.every((s) => s.red || !s.text.trim());

  let currentChapter = 0;
  // Most recent \v line of the current chapter: { key, segments }.
  // `segments` is the same array object as the stored entry's once stored.
  let currentVerse = null;

  for (const line of raw.split("\n")) {
    // Chapter marker
    const chapMatch = line.match(/^\\c\s+(\d+)/);
    if (chapMatch) {
      currentChapter = Number.parseInt(chapMatch[1], 10);
      // A continuation line must never attach to a verse of the previous chapter.
      currentVerse = null;
      continue;
    }

    const verseMatch = line.match(/^\\v\s+(\d+)\s+(.*)/);
    if (!verseMatch) {
      // Continuation line: only lines carrying \wj content are merged in.
      if (currentVerse !== null && line.includes("\\wj")) {
        currentVerse.segments.push(...extractRedSegments(line));
        const entry = verseData.get(currentVerse.key);
        if (entry) {
          entry.fullyRed = isFullyRed(entry.segments);
        } else if (hasRedText(currentVerse.segments)) {
          // The \v line had no red text but its continuation does.
          verseData.set(currentVerse.key, {
            segments: currentVerse.segments,
            fullyRed: isFullyRed(currentVerse.segments),
          });
        }
      }
      continue;
    }

    const verseNum = Number.parseInt(verseMatch[1], 10);
    const key = `${currentChapter}:${verseNum}`;
    const segments = extractRedSegments(verseMatch[2]);
    const existing = verseData.get(key);

    if (existing) {
      // Repeated \v for the same verse (shouldn't happen but be safe)
      if (hasRedText(segments)) {
        existing.segments.push(...segments);
        existing.fullyRed = existing.fullyRed && isFullyRed(segments);
      }
      currentVerse = { key, segments: existing.segments };
      continue;
    }

    currentVerse = { key, segments };
    if (hasRedText(segments)) {
      verseData.set(key, { segments, fullyRed: isFullyRed(segments) });
    }
  }
  return verseData;
}
/**
 * Split one USFM line into an ordered list of red / non-red text segments.
 *
 * Footnotes (\f ... \f*), cross-references (\x ... \x*) and \bdit ... \bdit*
 * spans are dropped first, content included. The remainder is scanned for
 * "\wj" openers and "\wj*" / "\+wj*" closers; every piece of text is passed
 * through cleanUSFM and kept only when something is left.
 *
 * @param {string} text - Raw USFM line or verse content.
 * @returns {Array<{text: string, red: boolean}>} Segments in reading order.
 */
function extractRedSegments(text) {
  const source = [/\\f\s+.*?\\f\*/g, /\\x\s+.*?\\x\*/g, /\\bdit\s+.*?\\bdit\*/g].reduce(
    (acc, pattern) => acc.replace(pattern, ""),
    text,
  );
  const segments = [];

  // Store a cleaned piece of text unless cleaning leaves nothing.
  const emit = (rawPiece, red) => {
    const cleaned = cleanUSFM(rawPiece);
    if (cleaned) segments.push({ text: cleaned, red });
  };

  // Position just past an opening marker at markerIdx: after the next space
  // when that space is fewer than 6 chars away, otherwise 4 chars further on.
  const skipOpener = (markerIdx) => {
    const spaceIdx = source.indexOf(" ", markerIdx);
    const spaceIsNear = spaceIdx !== -1 && spaceIdx - markerIdx < 6;
    return spaceIsNear ? spaceIdx + 1 : markerIdx + 4;
  };

  let cursor = 0;
  let insideRed = false;
  while (cursor < source.length) {
    if (!insideRed) {
      const openIdx = source.indexOf("\\wj", cursor);
      if (openIdx === -1) {
        // No further opener: the rest of the line is non-red.
        emit(source.substring(cursor), false);
        break;
      }
      emit(source.substring(cursor, openIdx), false);
      cursor = skipOpener(openIdx);
      insideRed = true;
      continue;
    }

    const closers = [source.indexOf("\\wj*", cursor), source.indexOf("\\+wj*", cursor)].filter(
      (idx) => idx !== -1,
    );
    if (closers.length === 0) {
      // Unclosed span: the rest of the line is red.
      emit(source.substring(cursor), true);
      break;
    }
    const closeIdx = Math.min(...closers);
    emit(source.substring(cursor, closeIdx), true);
    // "\wj*" is 4 chars long, "\+wj*" is 5.
    cursor = closeIdx + (source[closeIdx + 1] === "+" ? 5 : 4);

    // When only markers/whitespace separate this closer from the next opener,
    // jump straight into that span and remain in red mode.
    const reopenIdx = source.indexOf("\\wj", cursor);
    const gapEnd = reopenIdx === -1 ? source.length : reopenIdx;
    const gapText = cleanUSFM(source.substring(cursor, gapEnd)).trim();
    if (reopenIdx === -1 || gapText) {
      insideRed = false;
    } else {
      cursor = skipOpener(reopenIdx);
    }
  }
  return segments;
}
/**
 * Remove USFM formatting markers from text, keeping only the readable text.
 *
 * Closing markers (\it*, \+wj*) are removed first, then opening markers in
 * both their plain (\it, \q1) and nested (\+nd, \+wj) forms together with one
 * following whitespace character. Runs of whitespace are collapsed to a single
 * space and the result is trimmed. Characters other than markers and
 * whitespace are left untouched.
 *
 * @param {string} text - USFM fragment.
 * @returns {string} Marker-free text.
 */
function cleanUSFM(text) {
  return (
    text
      // Closing markers such as \it* or \+wj*
      .replace(/\\[a-z+]+\*/g, "")
      // Opening markers such as \it, \q1 and nested \+nd / \+wj
      .replace(/\\\+?[a-z]+\d?\s?/g, "")
      // Collapse extra whitespace
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Normalize Hindi text for fuzzy comparison.
 * Handles common differences between IRV and traditional Hindi Bible.
 *
 * Quotes and punctuation are removed, two Devanagari signs are folded onto
 * their counterparts, whitespace runs are collapsed and the result is trimmed.
 * The sign folds map one UTF-16 code unit to one code unit, so they do not
 * shift character positions.
 *
 * @param {string} text - Verse text.
 * @returns {string} Normalized text.
 */
function normalize(text) {
  return (
    text
      // Remove quotes, punctuation
      .replace(/[""''"\(\)।;:,!?¶]/g, "")
      // Normalize chandrabindu/anusvara: U+0901 -> U+0902.
      // Written as escapes so the bare combining mark cannot be lost from the
      // source again (an empty regex literal here is a syntax error).
      .replace(/\u0901/g, "\u0902")
      // Normalize matras.
      // NOTE(review): the source pattern was lost; only the replacement
      // (U+0942, long u) survived. Short u (U+0941) is assumed - confirm.
      .replace(/\u0941/g, "\u0942")
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Decide how the red / non-red layout of the IRV segments maps onto a hi.txt
 * verse text.
 *
 * Consecutive segments of the same kind are merged first (the input array and
 * its objects are not modified). The merged layout then selects the result:
 *   - all red                    -> { type: "full" }
 *   - narrative + red            -> { type: "prefix", splitIdx }
 *   - red + narrative            -> { type: "suffix", splitIdx }
 *   - narrative + ... + narrative -> { type: "both", prefixIdx, suffixIdx }
 * Indices are whatever findSplitPoint returns for hiText.
 *
 * @param {string} hiText - Verse text from hi.txt.
 * @param {Array<{text: string, red: boolean}>} segments - IRV segments for the verse.
 * @returns {{type: string, splitIdx?: number, prefixIdx?: number, suffixIdx?: number}|null}
 *   null when no split point is found or the layout is not one of the above
 *   (for example red + narrative + red).
 */
function findRedBoundaries(hiText, segments) {
  // Combine consecutive same-type segments
  const combined = [];
  for (const seg of segments) {
    const previous = combined[combined.length - 1];
    if (previous !== undefined && previous.red === seg.red) {
      previous.text += ` ${seg.text}`;
    } else {
      combined.push({ ...seg });
    }
  }

  // Fully red (an empty segment list also lands here, as before)
  if (combined.every((s) => s.red)) {
    return { type: "full" };
  }

  // From here on `combined` holds at least one non-red segment.
  const first = combined[0];
  const final = combined[combined.length - 1];
  const hasLeadingNonRed = !first.red && first.text.trim() !== "";
  const hasTrailingNonRed = !final.red && final.text.trim() !== "";
  const irvNonRedPrefix = hasLeadingNonRed ? first.text : "";
  const irvNonRedSuffix = hasTrailingNonRed ? final.text : "";

  if (hasLeadingNonRed && hasTrailingNonRed) {
    // Pattern: narrative + red + narrative
    const prefixIdx = findSplitPoint(hiText, irvNonRedPrefix, "prefix");
    const suffixIdx = findSplitPoint(hiText, irvNonRedSuffix, "suffix");
    if (prefixIdx !== -1 && suffixIdx !== -1 && prefixIdx < suffixIdx) {
      return { type: "both", prefixIdx, suffixIdx };
    }
  } else if (hasLeadingNonRed) {
    // Pattern: narrative + red (Jesus speaks to end of verse)
    const splitIdx = findSplitPoint(hiText, irvNonRedPrefix, "prefix");
    if (splitIdx !== -1) {
      return { type: "prefix", splitIdx };
    }
  } else if (hasTrailingNonRed) {
    // Pattern: red + narrative (Jesus finishes, narrator continues)
    const splitIdx = findSplitPoint(hiText, irvNonRedSuffix, "suffix");
    if (splitIdx !== -1) {
      return { type: "suffix", splitIdx };
    }
  }
  return null;
}
/**
 * Locate the index in hiText where the red / non-red boundary should fall.
 *
 * Strategy 1 (both modes): look for 5 down to 2 words of the normalized IRV
 * non-red text inside the normalized hiText - the last words for "prefix",
 * the first words otherwise - and map the hit back to an index in hiText.
 * Strategy 2: fall back to fixed Hindi phrases. For "prefix" the split is
 * placed after the right-most speech marker; otherwise it is placed at the
 * last occurrence of the first narrative-continuation phrase that lies past
 * 30% of the text.
 *
 * @param {string} hiText - Verse text from hi.txt.
 * @param {string} irvNonRedText - Non-red (narrative) IRV text next to the boundary.
 * @param {string} position - "prefix"; any other value selects suffix handling.
 * @returns {number} Index into hiText, or -1 when nothing matched.
 */
function findSplitPoint(hiText, irvNonRedText, position) {
  const wantPrefix = position === "prefix";
  const hiNorm = normalize(hiText);
  const irvWords = normalize(irvNonRedText).split(" ").filter(Boolean);

  // Strategy 1: anchor on the IRV words adjacent to the boundary.
  for (let count = Math.min(5, irvWords.length); count >= 2; count -= 1) {
    const picked = wantPrefix ? irvWords.slice(-count) : irvWords.slice(0, count);
    const phrase = picked.join(" ");
    if (phrase.length <= 3) continue;
    const hit = hiNorm.indexOf(phrase);
    if (hit === -1) continue;

    if (wantPrefix) {
      // Boundary sits after the phrase; step over whitespace and punctuation.
      let after = findOriginalPos(hiText, hiNorm, hit + phrase.length);
      while (after < hiText.length && /[\s,;:]/.test(hiText[after])) after += 1;
      return after;
    }
    // Boundary sits before the phrase; back up past any whitespace.
    let before = findOriginalPos(hiText, hiNorm, hit);
    while (before > 0 && /\s/.test(hiText[before - 1])) before -= 1;
    return before;
  }

  if (wantPrefix) {
    // Strategy 2: split after the right-most speech marker
    // (handles multi-speaker verses).
    const speechMarkers = [
      "उत्तर दिया,", "उत्तर दिया;", "उत्तर दिया:",
      "कहा,", "कहा;", "कहा:", "कहा।",
      "बोला,", "बोला;", "बोला:",
      "कहने लगा,", "कहने लगा;",
      "पुकारकर कहा,", "पुकारकर कहा;",
      "डांटकर कहा,", "डांटकर कहा;",
    ];
    return speechMarkers.reduce((best, marker) => {
      const at = hiText.lastIndexOf(marker);
      if (at === -1) return best;
      let cut = at + marker.length;
      while (cut < hiText.length && hiText[cut] === " ") cut += 1;
      return best === -1 || cut > best ? cut : best;
    }, -1);
  }

  // Strategy 2: narrative continuation phrases marking where the narrator resumes.
  const suffixMarkers = [
    " और उसका", " और वह", " तब उसने", " तब उस ने",
    " और उस ने", " फिर उस ने", " सो उस ने",
  ];
  const found = suffixMarkers
    .map((marker) => hiText.lastIndexOf(marker))
    .find((idx) => idx !== -1 && idx > hiText.length * 0.3);
  return found === undefined ? -1 : found;
}
/**
 * Translate an offset in normalized text into the matching offset in the
 * original text.
 *
 * The original is walked one character at a time until `normPos` normalized
 * characters are accounted for: characters from the punctuation set count for
 * nothing, whitespace counts only while the normalized text also has
 * whitespace at the current offset, and any other character counts as one.
 *
 * @param {string} original - Text before normalization.
 * @param {string} normalized - The normalized form of `original`.
 * @param {number} normPos - Offset into `normalized`.
 * @returns {number} Offset into `original` (at most original.length).
 */
function findOriginalPos(original, normalized, normPos) {
  const dropped = /[""''"\(\)।;:,!?¶]/;
  const blank = /\s/;
  let accounted = 0;
  let cursor = 0;
  for (; accounted < normPos && cursor < original.length; cursor += 1) {
    const ch = original[cursor];
    if (dropped.test(ch)) continue;
    if (!blank.test(ch)) {
      accounted += 1;
      continue;
    }
    // Whitespace: consume a normalized char only if it is whitespace too.
    if (accounted < normalized.length && blank.test(normalized[accounted])) {
      accounted += 1;
    }
  }
  return cursor;
}
/**
 * Apply red tags to a single hi.txt line based on boundary information.
 *
 * The line is pipe-separated; everything after the 6th pipe is the verse text
 * (it may itself contain pipes). Boundary indices are offsets into that verse
 * text, including any leading "¶"/whitespace. The line is returned unchanged
 * when it has fewer than 7 fields, is already tagged, has no content, or the
 * boundary cannot be applied.
 *
 * @param {string} line - One hi.txt line.
 * @param {{type: string, splitIdx?: number, prefixIdx?: number, suffixIdx?: number}} boundaries
 *   Result of findRedBoundaries.
 * @returns {string} The line with <red>...</red> inserted, or the original line.
 */
function applyRedTags(line, boundaries) {
  if (!boundaries) return line;
  const parts = line.split("|");
  if (parts.length < 7) return line;
  const prefix = parts.slice(0, 6).join("|");
  const text = parts.slice(6).join("|");
  // Don't double-tag
  if (text.includes("<red>")) return line;
  // Preserve leading ¶ and whitespace (the pattern always matches)
  const [, leading, content] = text.match(/^([¶\s]*)(.*)/);
  if (!content.trim()) return line;
  const fullText = leading + content;

  switch (boundaries.type) {
    case "full":
      return `${prefix}|${leading}<red>${content}</red>`;

    case "prefix": {
      const { splitIdx } = boundaries;
      if (splitIdx >= 0 && splitIdx < fullText.length) {
        const before = fullText.substring(0, splitIdx);
        const after = fullText.substring(splitIdx);
        if (after.trim()) {
          return `${prefix}|${before}<red>${after}</red>`;
        }
      }
      return line;
    }

    case "suffix": {
      const { splitIdx } = boundaries;
      // The red part starts after the leading marker, so a split at or before
      // that point leaves nothing to tag. Previously this produced an empty
      // <red></red> and repeated part of the leading text.
      if (splitIdx > leading.length && splitIdx <= fullText.length) {
        const red = fullText.substring(leading.length, splitIdx);
        const after = fullText.substring(splitIdx);
        return `${prefix}|${leading}<red>${red}</red>${after}`;
      }
      return line;
    }

    case "both": {
      const { prefixIdx, suffixIdx } = boundaries;
      if (prefixIdx >= 0 && suffixIdx > prefixIdx && suffixIdx <= fullText.length) {
        const before = fullText.substring(0, prefixIdx);
        const red = fullText.substring(prefixIdx, suffixIdx);
        const after = fullText.substring(suffixIdx);
        if (red.trim()) {
          return `${prefix}|${before}<red>${red}</red>${after}`;
        }
      }
      return line;
    }

    default:
      return line;
  }
}
// ============= MAIN =============
// Step 1: collect red letter data from every USFM file that is present.
console.log("Parsing USFM files for red letter data...");
const allRedData = new Map(); // "bookIdx:chapter:verse" -> {segments, fullyRed}
for (const { usfmFile, bookIdx } of BOOKS) {
  const usfmPath = path.join(usfmDir, usfmFile);
  if (fs.existsSync(usfmPath)) {
    let fullCount = 0;
    let partialCount = 0;
    parseUSFM(usfmPath).forEach((data, chapVerse) => {
      // USFM chapters and verses are 1-based, hi.txt uses 0-based
      const [chap, verse] = chapVerse.split(":").map((n) => parseInt(n) - 1);
      allRedData.set(`${bookIdx}:${chap}:${verse}`, data);
      if (data.fullyRed) {
        fullCount += 1;
      } else {
        partialCount += 1;
      }
    });
    console.log(` ${usfmFile}: ${fullCount} full, ${partialCount} partial`);
  } else {
    console.log(` Skipping ${usfmFile} (not found)`);
  }
}
console.log(`Total USFM red data: ${allRedData.size} verses`);

// Step 2: rewrite hi.txt line by line, tagging the verses found above.
const hiLines = fs.readFileSync(hiPath, "utf-8").split("\n");
let taggedFull = 0;
let taggedPartial = 0;
let skippedAlready = 0;
let failedPartial = 0;

// Wrap the whole verse text (minus leading ¶/whitespace) in <red> tags;
// returns null when the verse has no content to wrap.
const wrapWholeVerse = (parts, text) => {
  const [, leading = "", content = ""] = text.match(/^([¶\s]*)(.*)/);
  if (!content.trim()) return null;
  return `${parts.slice(0, 6).join("|")}|${leading}<red>${content}</red>`;
};

const newLines = hiLines.map((line) => {
  if (!line) return line;
  const parts = line.split("|");
  if (parts.length < 7) return line;

  const [, bookIdx, chapIdx, verseIdx] = parts;
  const redData = allRedData.get(`${bookIdx}:${chapIdx}:${verseIdx}`);
  if (!redData) return line;

  // Verse text is everything after the 6th pipe
  const text = parts.slice(6).join("|");
  if (text.includes("<red>")) {
    skippedAlready += 1;
    return line;
  }

  if (redData.fullyRed) {
    // Tag entire verse
    const tagged = wrapWholeVerse(parts, text);
    if (tagged === null) return line;
    taggedFull += 1;
    return tagged;
  }

  // Partially red verse - try to find boundaries
  const boundaries = findRedBoundaries(text, redData.segments);
  if (boundaries && boundaries.type === "full") {
    // Recategorized as full
    const tagged = wrapWholeVerse(parts, text);
    if (tagged !== null) {
      taggedFull += 1;
      return tagged;
    }
  } else if (boundaries) {
    const result = applyRedTags(line, boundaries);
    if (result !== line) {
      taggedPartial += 1;
      return result;
    }
  }
  failedPartial += 1;
  return line;
});

fs.writeFileSync(hiPath, newLines.join("\n"), "utf-8");
console.log(`\nResults:`);
console.log(` Tagged full: ${taggedFull}`);
console.log(` Tagged partial: ${taggedPartial}`);
console.log(` Already tagged: ${skippedAlready}`);
console.log(` Failed partial: ${failedPartial}`);
console.log(` Total modified: ${taggedFull + taggedPartial}`);
// Results for hi.txt:
// 1,969 total verses with red letter tags
// 1,405 fully-red verses (from the earlier addRedLetters.js via KJV)
// 20 additional fully-red verses discovered from IRV USFM
// 544 partially-red verses — narrative + Jesus's words accurately split
// 77 partial verses that couldn't be reliably matched (left untagged to avoid errors)