only-bible-app

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.

Readme
Commits
Files
scripts/scrapeBsbRedLetters2.js

Contents
History
Blame
import fs from "fs";
import path from "path";
import axios from "axios";
import { fileURLToPath } from "url";

// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// The BSB text file that main() reads, re-tags and overwrites in place.
const bsbPath = path.join(__dirname, "files", "en_bsb.txt");

// Books whose chapters are scraped for red-letter markup.
//   idx      - book identifier; main() builds `${idx}:${chapter - 1}:${verse - 1}`
//              and looks it up against fields 1-3 of each line in en_bsb.txt.
//              NOTE(review): looks like a zero-based position in the 66-book
//              canon (Matthew = 39) - confirm against the data file.
//   slug     - path segment used in the biblehub.com chapter URL.
//   chapters - number of chapters to fetch (1..chapters inclusive).
const BOOKS = [
  { idx: 39, slug: "matthew", chapters: 28 },
  { idx: 40, slug: "mark", chapters: 16 },
  { idx: 41, slug: "luke", chapters: 24 },
  { idx: 42, slug: "john", chapters: 21 },
  { idx: 43, slug: "acts", chapters: 28 },
  { idx: 45, slug: "1_corinthians", chapters: 16 },
  { idx: 46, slug: "2_corinthians", chapters: 13 },
  { idx: 65, slug: "revelation", chapters: 22 },
];

/**
 * Decode the HTML character references that occur in scraped verse text.
 *
 * Supports decimal (`&#8220;`) and hexadecimal (`&#x201C;`) numeric references
 * and the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`.
 *
 * Everything is decoded in a single pass so the output of one replacement is
 * never scanned again: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 * Numeric references outside the Unicode range are left untouched.
 *
 * @param {string} str - Text that may contain character references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeEntities(str) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: "\u00A0" };
  const entityRe = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g;

  return str.replace(entityRe, (entity, dec, hex, name) => {
    if (name !== undefined) return named[name];
    const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    // fromCodePoint (unlike fromCharCode) handles astral characters, but throws
    // on values above U+10FFFF, so keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Remove every `<...>` tag from a string, keeping only the text between tags.
 *
 * @param {string} str - Markup to strip.
 * @returns {string} The input with all tag-like spans removed.
 */
function stripTags(str) {
  const textPieces = str.split(/<[^>]+>/);
  return textPieces.join("");
}

/**
 * Normalise whitespace: collapse each run of whitespace to a single space and
 * drop leading/trailing whitespace.
 *
 * @param {string} str - Text to normalise.
 * @returns {string} The whitespace-normalised text.
 */
function norm(str) {
  const words = str.split(/\s+/).filter((word) => word !== "");
  return words.join(" ");
}

/**
 * Extract the red-letter segments of every verse in a chapter page.
 *
 * The chapter body starts at `<div class="chap">` and ends at the first of:
 * `<A name="fn">`, `<div id="botbox">`, or (last resort) the first `</div>`
 * found at least 5000 characters after the chapter start.
 *
 * Each verse spans from the end of its `reftext` marker to the start of the
 * next marker (or the end of the chapter body). Only verses that contain at
 * least one red segment are returned.
 *
 * @param {string} html - Full HTML of a chapter page.
 * @returns {Map<number, Array<{text: string, red: boolean}>>} Verse number to
 *   its ordered text segments; empty when the chapter body cannot be located.
 */
function parseChapter(html) {
  const chapStart = html.indexOf('<div class="chap">');
  if (chapStart === -1) return new Map();

  let chapEnd = html.indexOf('<A name="fn">', chapStart);
  if (chapEnd === -1) chapEnd = html.indexOf('<div id="botbox">', chapStart);
  if (chapEnd === -1) chapEnd = html.indexOf("</div>", chapStart + 5000);
  if (chapEnd === -1) return new Map();

  const content = html.substring(chapStart, chapEnd);

  // Verse markers, in document order (matchAll yields ascending positions).
  const verseRe = /<span class="reftext"><a href="[^"]*?(\d+)-(\d+)\.htm"><b>(\d+)<\/b><\/a><\/span>/g;
  const verseMarkers = [];
  for (const match of content.matchAll(verseRe)) {
    verseMarkers.push({
      verseNum: Number.parseInt(match[3], 10),
      pos: match.index,
      end: match.index + match[0].length,
    });
  }

  // Red ranges are computed once for the whole chapter because a red span may
  // start in one verse and close in a later one.
  const redRanges = findRedRanges(content);

  const verseData = new Map();
  verseMarkers.forEach((marker, i) => {
    const vStart = marker.end;
    const vEnd = i + 1 < verseMarkers.length ? verseMarkers[i + 1].pos : content.length;
    const segments = getRedSegments(content, vStart, vEnd, redRanges);

    if (segments.some((segment) => segment.red)) {
      verseData.set(marker.verseNum, segments);
    }
  });

  return verseData;
}

/**
 * Locate every `<span class="red">...</span>` region, pairing each opening tag
 * with its own closing `</span>` even when other spans are nested inside.
 *
 * For each red opening tag the text after it is scanned with a depth counter:
 * any `<span ...>` tag increments it, any `</span>` decrements it, and the
 * close that brings the depth back to zero ends the range. A red span that is
 * never closed produces no range.
 *
 * @param {string} html - Markup to scan.
 * @returns {Array<{start: number, contentStart: number, end: number, contentEnd: number}>}
 *   Ranges in order of their opening tag. `start`/`end` include the tags,
 *   `contentStart`/`contentEnd` exclude them; all ends are exclusive.
 */
function findRedRanges(html) {
  const RED_OPEN = '<span class="red">';
  const SPAN_CLOSE = "</span>";
  const ranges = [];

  for (
    let startPos = html.indexOf(RED_OPEN);
    startPos !== -1;
    startPos = html.indexOf(RED_OPEN, startPos + RED_OPEN.length)
  ) {
    const contentStart = startPos + RED_OPEN.length;
    let depth = 1;
    let cursor = contentStart;

    while (depth > 0 && cursor < html.length) {
      const nextClose = html.indexOf(SPAN_CLOSE, cursor);
      if (nextClose === -1) break; // unbalanced markup: give up on this span

      const nextOpen = html.indexOf("<span", cursor);
      if (nextOpen !== -1 && nextOpen < nextClose) {
        // "<span" may be a prefix of some other tag name; require a word
        // boundary after it before counting it as a nested span.
        const tagEnd = html.indexOf(">", nextOpen);
        const isSpanTag = tagEnd !== -1 && /^<span\b/.test(html.substring(nextOpen, tagEnd + 1));
        if (isSpanTag) {
          depth++;
          cursor = tagEnd + 1;
        } else {
          cursor = nextOpen + 1;
        }
        continue;
      }

      depth--;
      const closeEnd = nextClose + SPAN_CLOSE.length;
      if (depth === 0) {
        ranges.push({ start: startPos, contentStart, end: closeEnd, contentEnd: nextClose });
      }
      cursor = closeEnd;
    }
  }

  return ranges;
}

/**
 * Split the text of one verse into consecutive red / non-red segments.
 *
 * Characters of `html` in `[vStart, vEnd)` are flagged red when they fall
 * inside any of `redRanges`. Tags (everything from `<` to the next `>`) are
 * skipped; the remaining text is grouped into runs of equal redness. Each run
 * is entity-decoded and whitespace-normalised, and runs that end up empty are
 * dropped.
 *
 * @param {string} html - Chapter markup that the offsets refer to.
 * @param {number} vStart - Inclusive start offset of the verse.
 * @param {number} vEnd - Exclusive end offset of the verse.
 * @param {Array<{start: number, end: number}>} redRanges - Red regions
 *   (offsets into `html`, `end` exclusive), as produced by findRedRanges.
 * @returns {Array<{text: string, red: boolean}>} Segments in document order.
 */
function getRedSegments(html, vStart, vEnd, redRanges) {
  // Per-character redness for the verse window, indexed relative to vStart.
  const isRed = new Array(vEnd - vStart).fill(false);
  for (const { start, end } of redRanges) {
    const from = Math.max(start, vStart) - vStart;
    const to = Math.min(end, vEnd) - vStart;
    if (from < to) isRed.fill(true, from, to);
  }

  const segments = [];
  const flush = (text, red) => {
    if (!text) return;
    const decoded = norm(decodeEntities(text));
    if (decoded) segments.push({ text: decoded, red });
  };

  const rawHtml = html.substring(vStart, vEnd);
  let currentText = "";
  let currentRed = false;
  let inTag = false;

  for (let i = 0; i < rawHtml.length; i++) {
    const ch = rawHtml[i];

    if (ch === "<") {
      inTag = true;
      continue;
    }
    if (inTag) {
      if (ch === ">") inTag = false;
      continue;
    }

    // Text character: close the running segment whenever redness flips.
    const red = isRed[i];
    if (red !== currentRed) {
      flush(currentText, currentRed);
      currentText = "";
    }
    currentRed = red;
    currentText += ch;
  }
  flush(currentText, currentRed);

  return segments;
}

/**
 * Wrap the red-letter portions of a verse in `<red>...</red>` tags.
 *
 * When every segment is red the whole verse is wrapped, keeping any leading
 * pilcrow / whitespace outside the tag. Otherwise each red segment is located
 * in the verse text, trying in order:
 *   1. an exact substring match,
 *   2. a match that tolerates differing whitespace,
 *   3. a match with the segment's leading/trailing quote marks stripped
 *      (only for more than 5 remaining characters), re-absorbing any quote
 *      marks that surround the hit in the verse text.
 * Each strategy takes the first occurrence that is not already inside a
 * `<red>` tag, so a phrase repeated within a verse can be tagged twice.
 * Segments that cannot be located are left untagged.
 *
 * Finally, red runs separated only by whitespace are merged into one run,
 * preserving that whitespace.
 *
 * @param {string} verseText - Verse text without red tags.
 * @param {Array<{text: string, red: boolean}>} parts - Ordered segments of the verse.
 * @returns {string} The verse text with `<red>` tags inserted.
 */
function applyRedTags(verseText, parts) {
  if (parts.every((p) => p.red)) {
    // Both groups may be empty, so this pattern matches any string.
    const [, leading, content] = verseText.match(/^([¶\s]*)(.*)/s);
    return `${leading}<red>${content}</red>`;
  }

  const QUOTE_CHAR = /[\u201C\u201D\u2018\u2019"']/;

  const wrap = (text, start, end) =>
    `${text.substring(0, start)}<red>${text.substring(start, end)}</red>${text.substring(end)}`;

  // First literal occurrence of `needle` that is not already tagged, or -1.
  const indexOfUntagged = (text, needle) => {
    let idx = text.indexOf(needle);
    while (idx !== -1 && isInsideRedTag(text, idx)) {
      idx = text.indexOf(needle, idx + 1);
    }
    return idx;
  };

  let result = verseText;
  for (const { red, text: redText } of parts) {
    if (!red || !redText) continue;

    // 1. Exact match.
    const exactIdx = indexOfUntagged(result, redText);
    if (exactIdx !== -1) {
      result = wrap(result, exactIdx, exactIdx + redText.length);
      continue;
    }

    // 2. Same words, any amount/kind of whitespace between them.
    const pattern = redText.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s+/g, "\\s+");
    const current = result;
    const flexMatch = [...current.matchAll(new RegExp(pattern, "g"))].find(
      (m) => !isInsideRedTag(current, m.index),
    );
    if (flexMatch) {
      result = wrap(result, flexMatch.index, flexMatch.index + flexMatch[0].length);
      continue;
    }

    // 3. Ignore differing quote marks at the edges of the segment.
    const stripped = redText.replace(/^[\u201C\u201D\u2018\u2019"']+|[\u201C\u201D\u2018\u2019"']+$/g, "");
    if (stripped.length > 5) {
      const idx = indexOfUntagged(result, stripped);
      if (idx !== -1) {
        let start = idx;
        let end = idx + stripped.length;
        while (start > 0 && QUOTE_CHAR.test(result[start - 1])) start--;
        while (end < result.length && QUOTE_CHAR.test(result[end])) end++;
        result = wrap(result, start, end);
      }
    }
  }

  // Merge adjacent red runs but keep the whitespace that separated them.
  return result.replace(/<\/red>(\s*)<red>/g, "$1");
}

/**
 * Report whether position `idx` of `str` lies inside an open `<red>` element,
 * i.e. the nearest red tag before `idx` is an opening one.
 *
 * @param {string} str - Text that may already contain `<red>` tags.
 * @param {number} idx - Offset to test.
 * @returns {boolean} True when the last `<red>` before `idx` comes after the
 *   last `</red>` before `idx`.
 */
function isInsideRedTag(str, idx) {
  const head = str.substring(0, idx);
  const [openAt, closeAt] = ["<red>", "</red>"].map((tag) => head.lastIndexOf(tag));
  return closeAt < openAt;
}

/**
 * Download one chapter page of the BSB from biblehub.com.
 *
 * Failures are logged and reported as `null` so the caller can carry on with
 * the remaining chapters.
 *
 * @param {string} slug - Book path segment, e.g. "matthew".
 * @param {number} chapter - Chapter number used in the URL.
 * @returns {Promise<*>} The response body, or `null` when the request failed.
 */
async function fetchChapter(slug, chapter) {
  const url = `https://biblehub.com/bsb/${slug}/${chapter}.htm`;
  const requestOptions = {
    headers: { "User-Agent": "Mozilla/5.0" },
    timeout: 15000,
  };

  try {
    const { data } = await axios.get(url, requestOptions);
    return data;
  } catch (err) {
    console.error(`  Failed to fetch ${url}: ${err.message}`);
    return null;
  }
}

/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Re-generate the `<red>` markup in en_bsb.txt from biblehub.com.
 *
 * The data file is read as pipe-separated lines with at least 7 fields;
 * fields 1-3 are used as the book / chapter / verse key and everything from
 * field 6 onward is the verse text. Existing `<red>` tags are stripped, every
 * chapter of every book in BOOKS is fetched and parsed, and the verses that
 * contain red text are re-tagged. The file is then overwritten in place.
 *
 * If a chapter cannot be fetched, its lines are restored to their original
 * content so a transient network error does not erase tags that were already
 * in the file. Failed chapters are listed at the end.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("Reading en_bsb.txt and stripping existing <red> tags...");
  const originalLines = fs.readFileSync(bsbPath, "utf-8").split("\n");
  const lines = originalLines.map((line) => line.replace(/<\/?red>/g, ""));

  const lineIndex = new Map(); // "book:chapter:verse" -> line number
  const chapterLines = new Map(); // "book:chapter" -> line numbers of that chapter
  lines.forEach((line, i) => {
    if (!line) return;
    const fields = line.split("|");
    if (fields.length < 7) return;
    const [, bookField, chapterField, verseField] = fields;
    lineIndex.set(`${bookField}:${chapterField}:${verseField}`, i);

    const chapterKey = `${bookField}:${chapterField}`;
    if (!chapterLines.has(chapterKey)) chapterLines.set(chapterKey, []);
    chapterLines.get(chapterKey).push(i);
  });

  let totalTagged = 0;
  let totalPartial = 0;
  let totalFull = 0;
  const failedChapters = [];

  for (const book of BOOKS) {
    console.log(`\nProcessing ${book.slug} (${book.chapters} chapters)...`);
    let bookTagged = 0;

    for (let ch = 1; ch <= book.chapters; ch++) {
      const chIdx = ch - 1; // the data file keys chapters and verses from 0
      const html = await fetchChapter(book.slug, ch);

      if (!html) {
        // Keep whatever tags this chapter had before rather than losing them.
        for (const i of chapterLines.get(`${book.idx}:${chIdx}`) ?? []) {
          lines[i] = originalLines[i];
        }
        failedChapters.push(`${book.slug} ${ch}`);
        await sleep(200);
        continue;
      }

      const redVerses = parseChapter(html);

      for (const [verseNum, parts] of redVerses) {
        const lineIdx = lineIndex.get(`${book.idx}:${chIdx}:${verseNum - 1}`);
        if (lineIdx === undefined) continue;

        const lineParts = lines[lineIdx].split("|");
        const prefix = lineParts.slice(0, 6).join("|");
        // The verse text may itself contain "|", so re-join the remainder.
        const verseText = lineParts.slice(6).join("|");

        const tagged = applyRedTags(verseText, parts);
        if (tagged !== verseText) {
          lines[lineIdx] = `${prefix}|${tagged}`;
          bookTagged++;
          if (parts.every((p) => p.red)) totalFull++;
          else totalPartial++;
        }
      }

      // Be polite to the remote server between requests.
      await sleep(200);
    }

    console.log(`  ${book.slug}: tagged ${bookTagged} verses`);
    totalTagged += bookTagged;
  }

  fs.writeFileSync(bsbPath, lines.join("\n"), "utf-8");
  console.log(`\nDone! Tagged ${totalTagged} verses total (${totalFull} full, ${totalPartial} partial)`);

  if (failedChapters.length > 0) {
    console.warn(
      `Could not fetch ${failedChapters.length} chapter(s); their lines were left unchanged: ${failedChapters.join(", ")}`,
    );
  }
}

// Entry point: run the scraper and log any rejection from main() via console.error.
main().catch(console.error);
~repos /only-bible-app

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.

scripts/scrapeBsbRedLetters2.js

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs