import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";

import axios from "axios";
// ES modules have no built-in __dirname, so derive it from this module's URL.
const moduleFilePath = fileURLToPath(import.meta.url);
const __dirname = path.dirname(moduleFilePath);

// Location of the BSB text file, inside the "files" directory next to this script.
const bsbPath = path.join(__dirname, "files", "en_bsb.txt");
// Book table entries. For each book:
//   idx      - book number used as the first component of the en_bsb.txt lookup key
//   slug     - path segment of the book in the biblehub.com chapter URL
//   chapters - how many chapters are fetched (1..chapters)
// NOTE(review): the enclosing array declaration is not visible here; these are
// presumably the elements of BOOKS, which the main loop iterates - confirm.
{ idx: 39, slug: "matthew", chapters: 28 },
{ idx: 40, slug: "mark", chapters: 16 },
{ idx: 41, slug: "luke", chapters: 24 },
{ idx: 42, slug: "john", chapters: 21 },
{ idx: 43, slug: "acts", chapters: 28 },
{ idx: 45, slug: "1_corinthians", chapters: 16 },
{ idx: 46, slug: "2_corinthians", chapters: 13 },
{ idx: 65, slug: "revelation", chapters: 22 },
/**
 * Decode the HTML entities this script cares about into literal characters.
 *
 * Decimal numeric references (e.g. `&#8220;`) become the character they name,
 * and `&quot;` becomes a double quote. Any other entity is left untouched.
 *
 * @param {string} str - Text that may contain HTML entities.
 * @returns {string} The text with the supported entities decoded.
 */
function decodeEntities(str) {
  return str
    .replace(/&#(\d+);/g, (entity, digits) => {
      const codePoint = Number.parseInt(digits, 10);
      // String.fromCodePoint throws above U+10FFFF, so such references stay verbatim.
      // fromCodePoint (not fromCharCode) keeps astral characters intact.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    })
    .replace(/&quot;/g, '"');
}
/**
 * Remove every HTML tag from a string, keeping only the text between tags.
 *
 * @param {string} str - Markup to strip.
 * @returns {string} The input with each `<...>` sequence removed.
 */
function stripTags(str) {
  return str.replace(/<[^>]+>/g, "");
}
/**
 * Normalise whitespace: collapse every run of whitespace to a single space
 * and trim both ends.
 *
 * @param {string} str - Text to normalise.
 * @returns {string} The normalised text (empty when `str` is only whitespace).
 */
function norm(str) {
  return str.replace(/\s+/g, " ").trim();
}
// Parse one chapter page and collect, per verse number, the text segments of that
// verse annotated as red or not red. Only verses with at least one red segment are
// stored in `verseData`. When the chapter container or its end cannot be located,
// an empty Map is returned.
// NOTE(review): the declarations of `m` and `markers`, the loop/brace closers and the
// final return are not visible in this view - confirm against the full file.
function parseChapter(html) {
// Locate the start of the chapter container.
const chapStart = html.indexOf('<div class="chap">');
if (chapStart === -1) return new Map();
// Locate the end of the chapter text: the footnote anchor, else the bottom box,
// else the first </div> found at least 5000 characters after the chapter start.
let chapEnd = html.indexOf('<A name="fn">', chapStart);
if (chapEnd === -1) chapEnd = html.indexOf('<div id="botbox">', chapStart);
if (chapEnd === -1) chapEnd = html.indexOf('</div>', chapStart + 5000);
if (chapEnd === -1) return new Map();
const content = html.substring(chapStart, chapEnd);
// Collect verse markers. The verse number comes from the bold text of the reference
// link (capture group 3); `pos`/`end` delimit the marker markup inside `content`.
const verseRe = /<span class="reftext"><a href="[^"]*?(\d+)-(\d+)\.htm"><b>(\d+)<\/b><\/a><\/span>/g;
while ((m = verseRe.exec(content)) !== null) {
markers.push({ type: "verse", verseNum: parseInt(m[3]), pos: m.index, end: m.index + m[0].length });
// Collect the positions of every opening red span as well.
// NOTE(review): the visible code below only consumes the "verse" markers.
const redOpenRe = /<span class="red">/g;
while ((m = redOpenRe.exec(content)) !== null) {
markers.push({ type: "red_open", pos: m.index, end: m.index + m[0].length });
// Order all markers by their position in the chapter content.
markers.sort((a, b) => a.pos - b.pos);
const verseData = new Map(); // verseNum -> [{text, red}]
// Resolve each <span class="red"> to its matching </span> once for the whole chapter.
const redRanges = findRedRanges(content);
// A verse's markup runs from the end of its own marker to the start of the next
// marker, or to the end of the content for the last verse.
const verseMarkers = markers.filter((m) => m.type === "verse").sort((a, b) => a.pos - b.pos);
for (let i = 0; i < verseMarkers.length; i++) {
const vStart = verseMarkers[i].end;
const vEnd = i + 1 < verseMarkers.length ? verseMarkers[i + 1].pos : content.length;
const verseNum = verseMarkers[i].verseNum;
// NOTE(review): verseHtml is not used by any visible line below.
const verseHtml = content.substring(vStart, vEnd);
// Split the verse range into plain-text segments flagged red / not red.
const segments = getRedSegments(content, vStart, vEnd, redRanges);
// Keep the verse only when it contains red text.
if (segments.some((s) => s.red)) {
verseData.set(verseNum, segments);
// Find all <span class="red">...</span> ranges, handling nesting.
// Each range pushed below records `start` (position of the opening tag),
// `contentStart` (just after the opening tag), `contentEnd` (position of the
// closing </span>) and `end` (just after that closing tag).
// NOTE(review): the declarations of `m`, `redOpens`, `ranges` and `depth`, the
// depth increments/decrements, the closers and the return are not visible here.
function findRedRanges(html) {
// NOTE(review): these two patterns are not used by any visible line below.
const spanOpenRe = /<span\b[^>]*>/g;
const spanCloseRe = /<\/span>/g;
// Find all red span opens
const redOpenRe = /<span class="red">/g;
while ((m = redOpenRe.exec(html)) !== null) {
// For each red open, find its matching close by counting nested spans
for (const startPos of redOpens) {
const afterOpen = startPos + '<span class="red">'.length;
let searchPos = afterOpen;
// Scan forward, comparing the next "<span" with the next "</span>", while the
// red span is still open.
while (depth > 0 && searchPos < html.length) {
const nextOpen = html.indexOf("<span", searchPos);
const nextClose = html.indexOf("</span>", searchPos);
// No closing tag left: give up on this red span.
if (nextClose === -1) break;
if (nextOpen !== -1 && nextOpen < nextClose) {
// Check if it's actually a span tag (not just text containing "<span")
const tagEnd = html.indexOf(">", nextOpen);
if (tagEnd !== -1 && html.substring(nextOpen, tagEnd + 1).match(/^<span\b/)) {
searchPos = nextOpen + 1;
ranges.push({ start: startPos, contentStart: afterOpen, end: nextClose + "</span>".length, contentEnd: nextClose });
// Continue scanning after the closing tag just examined.
searchPos = nextClose + "</span>".length;
// Get text segments for a verse range, annotated with red/not-red.
// `html` is the chapter content, [vStart, vEnd) the verse's slice of it, and
// `redRanges` the ranges whose `start`/`end` are compared against that slice.
// Each emitted segment is { text, red }, where `text` has entities decoded and
// whitespace normalised; segments whose text ends up empty are dropped.
// NOTE(review): the declarations of `segments`, `currentText`, `currentRed` and
// `red`, the loop bodies, the closers and the return are not visible here.
function getRedSegments(html, vStart, vEnd, redRanges) {
// Determine which character positions in [vStart, vEnd) are "red".
// isRed is indexed relative to vStart.
const isRed = new Array(vEnd - vStart).fill(false);
for (const range of redRanges) {
// Clip the red range to the verse range; a non-empty clip means they overlap.
const overlapStart = Math.max(range.start, vStart);
const overlapEnd = Math.min(range.end, vEnd);
if (overlapStart < overlapEnd) {
for (let i = overlapStart - vStart; i < overlapEnd - vStart; i++) {
// Now extract text, splitting into red/non-red segments
const rawHtml = html.substring(vStart, vEnd);
for (let i = 0; i < rawHtml.length; i++) {
// Skip all HTML tags (span, a, b, p, etc.)
// When the red state flips and text has accumulated, flush it as a segment.
if (red !== currentRed && currentText) {
const decoded = norm(decodeEntities(currentText));
if (decoded) segments.push({ text: decoded, red: currentRed });
// Flush whatever text remains.
const decoded = norm(decodeEntities(currentText));
if (decoded) segments.push({ text: decoded, red: currentRed });
// Wrap the red portions of `verseText` in <red>...</red>.
// `parts` is a list of { text, red } segments for the verse.
// NOTE(review): the declarations of `result` and `start`, the guard that selects the
// all-red path, any filtering of non-red parts, the else/closing braces and the final
// return are not visible in this view - confirm the branch structure.
function applyRedTags(verseText, parts) {
const allRed = parts.every((p) => p.red);
// Whole-verse case: keep any leading pilcrow/whitespace outside the tag and wrap the rest.
// The pattern uses the `s` flag, so `.` also matches newlines.
const match = verseText.match(/^([¶\s]*)(.*)/s);
const leading = match[1] || "";
const content = match[2] || "";
return `${leading}<red>${content}</red>`;
// For partial red, find each red segment in the verse text
for (const part of parts) {
const redText = part.text;
// Attempt 1: exact substring match that is not already inside a <red> tag.
let idx = result.indexOf(redText);
if (idx !== -1 && !isInsideRedTag(result, idx)) {
result = result.substring(0, idx) + "<red>" + redText + "</red>" + result.substring(idx + redText.length);
// Attempt 2: flexible whitespace match. Regex metacharacters in the segment are
// escaped, then each whitespace run is allowed to match any whitespace run.
const escaped = redText.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s+/g, "\\s+");
const flexMatch = result.match(new RegExp(escaped));
if (flexMatch && !isInsideRedTag(result, flexMatch.index)) {
const found = flexMatch[0];
const fi = flexMatch.index;
result = result.substring(0, fi) + "<red>" + found + "</red>" + result.substring(fi + found.length);
// Attempt 3: strip leading/trailing straight and curly quotes from the segment and
// search again; only tried when more than 5 characters remain.
const stripped = redText.replace(/^[\u201C\u201D\u2018\u2019"']+|[\u201C\u201D\u2018\u2019"']+$/g, "");
if (stripped.length > 5) {
idx = result.indexOf(stripped);
if (idx !== -1 && !isInsideRedTag(result, idx)) {
let end = idx + stripped.length;
// Grow the match outwards over quote characters adjacent to it in the verse text.
while (start > 0 && /[\u201C\u201D\u2018\u2019"']/.test(result[start - 1])) start--;
while (end < result.length && /[\u201C\u201D\u2018\u2019"']/.test(result[end])) end++;
const found = result.substring(start, end);
result = result.substring(0, start) + "<red>" + found + "</red>" + result.substring(end);
// Merge adjacent red tags.
// NOTE(review): the whole match is replaced by "", so any whitespace between
// "</red>" and "<red>" is removed together with the tags - confirm this is intended.
result = result.replace(/<\/red>\s*<red>/g, "");
/**
 * Report whether position `idx` of `str` lies inside an open `<red>` element.
 *
 * Only the text before `idx` is examined: the position counts as inside when the
 * most recent `<red>` appears after the most recent `</red>`. With neither tag
 * present both lookups yield -1 and the result is `false`.
 *
 * @param {string} str - Text that may already contain `<red>` tags.
 * @param {number} idx - Position in `str` to test.
 * @returns {boolean} `true` when `idx` follows an unclosed `<red>`.
 */
function isInsideRedTag(str, idx) {
  const before = str.substring(0, idx);
  return before.lastIndexOf("<red>") > before.lastIndexOf("</red>");
}
// Request the BSB page for one chapter (`slug` = book path segment, `chapter` =
// chapter number) from biblehub.com.
// NOTE(review): the remainder of the axios call, the try/catch around it and the
// return statements are not visible in this view - confirm what is returned on
// success and on failure.
async function fetchChapter(slug, chapter) {
const url = `https://biblehub.com/bsb/${slug}/${chapter}.htm`;
// The request is sent with a browser-like User-Agent header.
const resp = await axios.get(url, {
headers: { "User-Agent": "Mozilla/5.0" },
// A failure is reported on stderr with the URL and the error's message.
console.error(` Failed to fetch ${url}: ${e.message}`);
// Returns a promise that resolves after `ms` milliseconds.
// NOTE(review): the enclosing function header is not visible in this view.
return new Promise((r) => setTimeout(r, ms));
// Main routine: load en_bsb.txt, remove existing red tags, re-tag verses from the
// fetched chapter pages, then write the file back in place.
// NOTE(review): the function header, the counters (`totalTagged`, `totalFull`,
// `totalPartial`, `bookTagged`), `chIdx`, several branch bodies and the closers are
// not visible in this view - confirm against the full file.
console.log("Reading en_bsb.txt and stripping existing <red> tags...");
let lines = fs.readFileSync(bsbPath, "utf-8").split("\n");
// Drop every existing <red> / </red> so tagging starts from untagged text.
lines = lines.map((line) => line.replace(/<\/?red>/g, ""));
// Index the pipe-delimited lines by fields 1, 2 and 3; lines with fewer than
// 7 fields are skipped.
const lineIndex = new Map();
for (let i = 0; i < lines.length; i++) {
const parts = lines[i].split("|");
if (parts.length < 7) continue;
const key = `${parts[1]}:${parts[2]}:${parts[3]}`;
for (const book of BOOKS) {
console.log(`\nProcessing ${book.slug} (${book.chapters} chapters)...`);
// Chapters are requested by their 1-based number, one at a time.
for (let ch = 1; ch <= book.chapters; ch++) {
const html = await fetchChapter(book.slug, ch);
const redVerses = parseChapter(html);
for (const [verseNum, parts] of redVerses) {
// The lookup key uses verseNum - 1 for the verse component.
// NOTE(review): chIdx is presumably the matching chapter index (ch - 1) - confirm.
const vIdx = verseNum - 1;
const key = `${book.idx}:${chIdx}:${vIdx}`;
const lineIdx = lineIndex.get(key);
if (lineIdx === undefined) {
// Fields 0-5 form the prefix; everything from field 6 on is the verse text,
// re-joined with "|" in case the text itself contains that character.
const lineParts = lines[lineIdx].split("|");
const prefix = lineParts.slice(0, 6).join("|");
const verseText = lineParts.slice(6).join("|");
const tagged = applyRedTags(verseText, parts);
// Rewrite the line only when tagging changed the verse text.
if (tagged !== verseText) {
lines[lineIdx] = `${prefix}|${tagged}`;
// A verse whose segments are all red counts as fully red.
if (parts.every((p) => p.red)) totalFull++;
console.log(` ${book.slug}: tagged ${bookTagged} verses`);
totalTagged += bookTagged;
// Overwrite the source file with the re-tagged lines.
fs.writeFileSync(bsbPath, lines.join("\n"), "utf-8");
console.log(`\nDone! Tagged ${totalTagged} verses total (${totalFull} full, ${totalPartial} partial)`);
// Entry point: run main and print any rejection via console.error.
main().catch(console.error);