~repos /only-bible-app

#kotlin#android#ios

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs

The only Bible app you will ever need. No ads. No in-app purchases. No distractions.



scripts/scrapeBsbRedLetters2.js



import fs from "fs";
import path from "path";
import axios from "axios";
import { fileURLToPath } from "url";
// Directory containing this script; derived from import.meta.url because ES
// modules do not provide a built-in __dirname.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Location of the BSB text file that main() reads and rewrites in place.
const bsbPath = path.join(__dirname, "files", "en_bsb.txt");
// Books to scrape. For each entry:
//   idx      - book index; main() combines it with the zero-based chapter and
//              verse indices to look up the matching line of en_bsb.txt
//   slug     - book name as it appears in the biblehub.com URL (see fetchChapter)
//   chapters - number of chapters; main() fetches chapters 1..chapters
const BOOKS = [
{ idx: 39, slug: "matthew", chapters: 28 },
{ idx: 40, slug: "mark", chapters: 16 },
{ idx: 41, slug: "luke", chapters: 24 },
{ idx: 42, slug: "john", chapters: 21 },
{ idx: 43, slug: "acts", chapters: 28 },
{ idx: 45, slug: "1_corinthians", chapters: 16 },
{ idx: 46, slug: "2_corinthians", chapters: 13 },
{ idx: 65, slug: "revelation", chapters: 22 },
];
/**
 * Decode the HTML character references found in scraped verse text.
 *
 * Recognises decimal (`&#8220;`) and hexadecimal (`&#x201C;`) numeric
 * references plus the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;` and `&nbsp;`. Any other `&...;` sequence is left untouched.
 *
 * @param {string} str - Text that may contain HTML entities.
 * @returns {string} The text with every recognised entity replaced by its character.
 */
function decodeEntities(str) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: "\u00A0" };
  // Decode in a single pass so that every reference is decoded exactly once:
  // "&amp;lt;" must become the literal text "&lt;", not "<".
  return str.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g,
    (match, dec, hex, name) => {
      if (name !== undefined) return named[name];
      const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
      // String.fromCodePoint throws on values above U+10FFFF; keep such
      // malformed references verbatim instead of crashing the scrape.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
}
/**
 * Remove every HTML tag from a string, keeping only the text around the tags.
 *
 * @param {string} str - HTML fragment.
 * @returns {string} The fragment with each `<...>` sequence removed.
 */
function stripTags(str) {
  const tagPattern = /<[^>]+>/g;
  // Splitting on the tags and re-joining the pieces drops the tags themselves.
  const textPieces = str.split(tagPattern);
  return textPieces.join("");
}
/**
 * Normalise whitespace: collapse each run of whitespace to a single space and
 * drop leading/trailing whitespace.
 *
 * @param {string} str - Text to normalise.
 * @returns {string} The whitespace-normalised text.
 */
function norm(str) {
  const words = str.split(/\s+/g).filter((piece) => piece !== "");
  return words.join(" ");
}
/**
 * Extract the red-letter verses from the HTML of one chapter page.
 *
 * The chapter body is the text from `<div class="chap">` up to the footnotes
 * anchor (`<A name="fn">`), or failing that `<div id="botbox">`, or as a last
 * resort the first `</div>` found at least 5000 characters after the chapter
 * start. Each verse runs from the end of its `reftext` marker to the start of
 * the next marker (or the end of the chapter body).
 *
 * @param {string} html - Full HTML of the chapter page.
 * @returns {Map<number, Array<{text: string, red: boolean}>>} Verse number ->
 *   ordered text segments as produced by getRedSegments. Only verses with at
 *   least one red segment are included; the map is empty when the chapter
 *   body cannot be located.
 */
function parseChapter(html) {
  const chapStart = html.indexOf('<div class="chap">');
  if (chapStart === -1) return new Map();

  let chapEnd = html.indexOf('<A name="fn">', chapStart);
  if (chapEnd === -1) chapEnd = html.indexOf('<div id="botbox">', chapStart);
  if (chapEnd === -1) chapEnd = html.indexOf('</div>', chapStart + 5000);
  if (chapEnd === -1) return new Map();
  const content = html.substring(chapStart, chapEnd);

  // Verse markers are collected in document order, so no sorting is needed.
  const verseRe = /<span class="reftext"><a href="[^"]*?(\d+)-(\d+)\.htm"><b>(\d+)<\/b><\/a><\/span>/g;
  const verseMarkers = [];
  for (const match of content.matchAll(verseRe)) {
    verseMarkers.push({
      verseNum: Number.parseInt(match[3], 10),
      pos: match.index,
      end: match.index + match[0].length,
    });
  }

  const redRanges = findRedRanges(content);
  const verseData = new Map();
  verseMarkers.forEach((marker, i) => {
    const vEnd = i + 1 < verseMarkers.length ? verseMarkers[i + 1].pos : content.length;
    const segments = getRedSegments(content, marker.end, vEnd, redRanges);
    // Verses without any red text are of no interest to the caller.
    if (segments.some((s) => s.red)) {
      verseData.set(marker.verseNum, segments);
    }
  });
  return verseData;
}
/**
 * Locate every `<span class="red">...</span>` element in an HTML string.
 *
 * The matching `</span>` of each red span is found by counting the `<span ...>`
 * tags opened and the `</span>` tags closed after it, so spans nested inside a
 * red span do not end it early. A red span with no matching close is skipped.
 *
 * @param {string} html - HTML to scan.
 * @returns {Array<{start: number, contentStart: number, end: number, contentEnd: number}>}
 *   One entry per closed red span, ordered by opening position. `start`/`end`
 *   delimit the whole element (tags included); `contentStart`/`contentEnd`
 *   delimit the text between the opening and closing tags.
 */
function findRedRanges(html) {
  const RED_OPEN = '<span class="red">';
  const SPAN_CLOSE = "</span>";
  const ranges = [];

  for (
    let startPos = html.indexOf(RED_OPEN);
    startPos !== -1;
    startPos = html.indexOf(RED_OPEN, startPos + RED_OPEN.length)
  ) {
    const contentStart = startPos + RED_OPEN.length;
    let depth = 1;
    let searchPos = contentStart;
    while (depth > 0 && searchPos < html.length) {
      const nextClose = html.indexOf(SPAN_CLOSE, searchPos);
      if (nextClose === -1) break; // unterminated span: nothing to record
      const nextOpen = html.indexOf("<span", searchPos);

      if (nextOpen !== -1 && nextOpen < nextClose) {
        // Only count real span tags, not other names that start with "<span".
        const tagEnd = html.indexOf(">", nextOpen);
        if (tagEnd !== -1 && /^<span\b/.test(html.substring(nextOpen, tagEnd + 1))) {
          depth++;
          searchPos = tagEnd + 1;
        } else {
          searchPos = nextOpen + 1;
        }
        continue;
      }

      depth--;
      if (depth === 0) {
        ranges.push({
          start: startPos,
          contentStart,
          end: nextClose + SPAN_CLOSE.length,
          contentEnd: nextClose,
        });
      }
      searchPos = nextClose + SPAN_CLOSE.length;
    }
  }
  return ranges;
}
/**
 * Split the text of one verse into consecutive red / non-red segments.
 *
 * Only the slice `html[vStart, vEnd)` is examined. HTML tags are dropped; the
 * remaining text is cut wherever the "inside a red range" state changes. Each
 * piece is entity-decoded (decodeEntities) and whitespace-normalised (norm);
 * pieces that end up empty are discarded.
 *
 * @param {string} html - HTML that contains the verse.
 * @param {number} vStart - Start offset of the verse (inclusive).
 * @param {number} vEnd - End offset of the verse (exclusive).
 * @param {Array<{start: number, end: number}>} redRanges - Red element ranges
 *   in `html` coordinates, as returned by findRedRanges.
 * @returns {Array<{text: string, red: boolean}>} Segments in document order.
 */
function getRedSegments(html, vStart, vEnd, redRanges) {
  // isRed[i] tells whether character vStart + i lies inside any red range.
  const isRed = new Array(vEnd - vStart).fill(false);
  for (const { start, end } of redRanges) {
    const from = Math.max(start, vStart);
    const to = Math.min(end, vEnd);
    // The guard also keeps both fill() offsets non-negative (negative offsets
    // would be interpreted relative to the end of the array).
    if (from < to) isRed.fill(true, from - vStart, to - vStart);
  }

  const rawHtml = html.substring(vStart, vEnd);
  const segments = [];
  let currentText = "";
  let currentRed = false;
  let inTag = false;

  // Emit the text collected so far (if anything is left after cleanup).
  const flush = () => {
    const decoded = norm(decodeEntities(currentText));
    if (decoded) segments.push({ text: decoded, red: currentRed });
    currentText = "";
  };

  for (let i = 0; i < rawHtml.length; i++) {
    const ch = rawHtml[i];
    if (inTag) {
      // Everything up to the closing ">" belongs to the tag and is skipped.
      if (ch === ">") inTag = false;
      continue;
    }
    if (ch === "<") {
      inTag = true;
      continue;
    }
    const red = isRed[i];
    if (red !== currentRed && currentText) flush();
    currentRed = red;
    currentText += ch;
  }
  if (currentText) flush();
  return segments;
}
/**
 * Wrap the red-letter portions of a verse in `<red>...</red>` tags.
 *
 * When every segment is red the whole verse is wrapped, leaving any leading
 * pilcrow / whitespace outside the tag. Otherwise each red segment is located
 * in the verse text and wrapped individually, trying in order:
 *   1. an exact match,
 *   2. a match that tolerates differing whitespace,
 *   3. a match of the segment without its surrounding quote characters
 *      (segments of more than 5 characters only); quote characters adjacent
 *      to the match in the verse are pulled into the tag.
 * Occurrences that already sit inside a `<red>` tag are skipped, so a phrase
 * that is repeated in the verse can be tagged once per red segment. Red runs
 * separated only by whitespace are merged, keeping that whitespace.
 *
 * @param {string} verseText - Verse text without red tags.
 * @param {Array<{text: string, red: boolean}>} parts - Segments of the verse
 *   in document order.
 * @returns {string} The verse text with red tags inserted; the input is
 *   returned unchanged when `parts` is empty or nothing could be matched.
 */
function applyRedTags(verseText, parts) {
  // `[].every()` is vacuously true; with no segments there is nothing to tag.
  if (parts.length === 0) return verseText;

  if (parts.every((p) => p.red)) {
    const [, leading, content] = verseText.match(/^([¶\s]*)(.*)/s);
    // Avoid emitting an empty <red></red> pair for a verse with no text.
    return content ? `${leading}<red>${content}</red>` : verseText;
  }

  const quoteChar = /[\u201C\u201D\u2018\u2019"']/;
  const wrap = (str, start, end) =>
    `${str.substring(0, start)}<red>${str.substring(start, end)}</red>${str.substring(end)}`;

  let result = verseText;
  for (const { text: redText, red } of parts) {
    if (!red || !redText) continue;

    // 1. Exact match.
    const exactIdx = indexOfOutsideRed(result, redText);
    if (exactIdx !== -1) {
      result = wrap(result, exactIdx, exactIdx + redText.length);
      continue;
    }

    // 2. Same words, but any amount of whitespace between them.
    const escaped = redText.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s+/g, "\\s+");
    const flexRe = new RegExp(escaped, "g");
    let flexMatch = flexRe.exec(result);
    while (flexMatch !== null && isInsideRedTag(result, flexMatch.index)) {
      flexMatch = flexRe.exec(result);
    }
    if (flexMatch !== null) {
      result = wrap(result, flexMatch.index, flexMatch.index + flexMatch[0].length);
      continue;
    }

    // 3. Ignore leading/trailing quote characters, which may differ between
    //    the scraped text and the verse text.
    const stripped = redText.replace(/^[\u201C\u201D\u2018\u2019"']+|[\u201C\u201D\u2018\u2019"']+$/g, "");
    if (stripped.length > 5) {
      const idx = indexOfOutsideRed(result, stripped);
      if (idx !== -1) {
        let start = idx;
        let end = idx + stripped.length;
        while (start > 0 && quoteChar.test(result[start - 1])) start--;
        while (end < result.length && quoteChar.test(result[end])) end++;
        result = wrap(result, start, end);
      }
    }
  }

  // Merge adjacent red runs but keep the whitespace that separated them;
  // dropping it would glue the neighbouring words together.
  return result.replace(/<\/red>(\s*)<red>/g, "$1");
}

/**
 * Find the first occurrence of `needle` in `str` that is not already inside a
 * `<red>` tag.
 *
 * @param {string} str - Text to search (may already contain red tags).
 * @param {string} needle - Text to look for.
 * @returns {number} Index of the occurrence, or -1 when there is none.
 */
function indexOfOutsideRed(str, needle) {
  let idx = str.indexOf(needle);
  while (idx !== -1 && isInsideRedTag(str, idx)) {
    idx = str.indexOf(needle, idx + 1);
  }
  return idx;
}

/**
 * Tell whether position `idx` of `str` lies inside an open `<red>` tag, i.e.
 * whether the last `<red>` before `idx` comes after the last `</red>`.
 *
 * @param {string} str - Text that may contain red tags.
 * @param {number} idx - Position to test.
 * @returns {boolean} True when `idx` is inside a red tag.
 */
function isInsideRedTag(str, idx) {
  const before = str.substring(0, idx);
  return before.lastIndexOf("<red>") > before.lastIndexOf("</red>");
}
/**
 * Download the biblehub.com BSB page for one chapter.
 *
 * @param {string} slug - Book name as used in the URL path.
 * @param {number} chapter - Chapter number as used in the URL path.
 * @returns {Promise<*>} The response body (`data` of the axios response), or
 *   `null` when the request fails; the failure is logged to stderr.
 */
async function fetchChapter(slug, chapter) {
  const url = `https://biblehub.com/bsb/${slug}/${chapter}.htm`;
  const requestOptions = {
    headers: { "User-Agent": "Mozilla/5.0" },
    timeout: 15000,
  };

  let body = null;
  try {
    const { data } = await axios.get(url, requestOptions);
    body = data;
  } catch (err) {
    // Best effort: report the failure and let the caller decide what to do.
    console.error(` Failed to fetch ${url}: ${err.message}`);
  }
  return body;
}
/**
 * Create a promise that resolves after a delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise resolved (with no value) once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Rebuild the `<red>` tags of en_bsb.txt from the biblehub.com BSB pages.
 *
 * Steps:
 *   1. Read the file and strip every existing `<red>` / `</red>` tag.
 *   2. Index the lines. A line is "|"-separated; fields 1-3 form the lookup
 *      key `book:chapter:verse` (chapter and verse are zero-based) and the
 *      verse text is everything from field 6 onwards.
 *   3. For every chapter of every book in BOOKS, download the page, extract
 *      the red-letter segments and tag the matching lines.
 *   4. Write the file back in place and print a summary.
 *
 * A chapter whose download fails keeps its original lines, including any red
 * tags they already had, so a network error cannot erase existing tagging.
 * Such chapters are listed at the end of the run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("Reading en_bsb.txt and stripping existing <red> tags...");
  const originalLines = fs.readFileSync(bsbPath, "utf-8").split("\n");
  const lines = originalLines.map((line) => line.replace(/<\/?red>/g, ""));

  const lineIndex = new Map(); // "book:chapter:verse" -> line number
  const chapterLineIdxs = new Map(); // "book:chapter" -> line numbers of that chapter
  for (let i = 0; i < lines.length; i++) {
    if (!lines[i]) continue;
    const fields = lines[i].split("|");
    if (fields.length < 7) continue;
    lineIndex.set(`${fields[1]}:${fields[2]}:${fields[3]}`, i);
    const chapterKey = `${fields[1]}:${fields[2]}`;
    if (!chapterLineIdxs.has(chapterKey)) chapterLineIdxs.set(chapterKey, []);
    chapterLineIdxs.get(chapterKey).push(i);
  }

  let totalTagged = 0;
  let totalPartial = 0;
  let totalFull = 0;
  const failedChapters = [];

  for (const book of BOOKS) {
    console.log(`\nProcessing ${book.slug} (${book.chapters} chapters)...`);
    let bookTagged = 0;
    for (let ch = 1; ch <= book.chapters; ch++) {
      const chIdx = ch - 1;
      const html = await fetchChapter(book.slug, ch);
      if (!html) {
        // The tags of this chapter were stripped above but cannot be rebuilt:
        // put the original lines back rather than silently losing them.
        for (const i of chapterLineIdxs.get(`${book.idx}:${chIdx}`) || []) {
          lines[i] = originalLines[i];
        }
        failedChapters.push(`${book.slug} ${ch}`);
        continue;
      }

      const redVerses = parseChapter(html);
      for (const [verseNum, parts] of redVerses) {
        const key = `${book.idx}:${chIdx}:${verseNum - 1}`;
        const lineIdx = lineIndex.get(key);
        if (lineIdx === undefined) {
          continue;
        }
        const lineParts = lines[lineIdx].split("|");
        const prefix = lineParts.slice(0, 6).join("|");
        // The verse text itself may contain "|", so re-join the remainder.
        const verseText = lineParts.slice(6).join("|");
        const tagged = applyRedTags(verseText, parts);
        if (tagged !== verseText) {
          lines[lineIdx] = `${prefix}|${tagged}`;
          bookTagged++;
          if (parts.every((p) => p.red)) totalFull++;
          else totalPartial++;
        }
      }
      // Small pause between requests to avoid hammering the server.
      await sleep(200);
    }
    console.log(` ${book.slug}: tagged ${bookTagged} verses`);
    totalTagged += bookTagged;
  }

  fs.writeFileSync(bsbPath, lines.join("\n"), "utf-8");
  console.log(`\nDone! Tagged ${totalTagged} verses total (${totalFull} full, ${totalPartial} partial)`);
  if (failedChapters.length > 0) {
    console.error(
      `Could not fetch ${failedChapters.length} chapter(s); their lines were left unchanged: ${failedChapters.join(", ")}`,
    );
  }
}
// Run the script. A fatal error is logged and also reflected in the exit
// status, so shells and CI jobs can tell that the run did not complete.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});