only-bible-app

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.

Readme
Commits
Files
scripts/scrapeBsbRedLetters.js

Contents
History
Blame
import fs from "fs";
import path from "path";
import axios from "axios";
import { fileURLToPath } from "url";

// ES modules have no built-in __dirname, so derive this script's directory
// from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Verse file (files/en_bsb.txt next to this script) that main() reads and
// then overwrites in place.
const bsbPath = path.join(__dirname, "files", "en_bsb.txt");

// Scrape targets: the books requested from BibleHub for red-letter (words of
// Jesus) markup. Each row below is [idx, slug, chapters]:
//   idx      - book index, matching the book field of en_bsb.txt
//   slug     - book name as it appears in the BibleHub URL path
//   chapters - chapter count; chapters 1..chapters are fetched
const BOOKS = [
  [39, "matthew", 28],
  [40, "mark", 16],
  [41, "luke", 24],
  [42, "john", 21],
  [43, "acts", 28],
  [45, "1_corinthians", 16],
  [46, "2_corinthians", 13],
  [65, "revelation", 22],
].map(([idx, slug, chapters]) => ({ idx, slug, chapters }));

// Decode the HTML character references found in scraped pages.
//
// Handles decimal (&#8220;) and hexadecimal (&#x201C;) numeric references and
// the named entities &amp; &lt; &gt; &quot;. Anything else is left untouched.
//
// Decoding happens in a single pass so that already-decoded output is never
// decoded again: "&amp;lt;" yields the literal text "&lt;", not "<".
//
// @param {string} str - text that may contain character references
// @returns {string} the text with recognised references replaced
function decodeHtmlEntities(str) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"' };
  const MAX_CODE_POINT = 0x10ffff;

  return str.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot));/g,
    (entity, decimal, hex, name) => {
      if (name !== undefined) return named[name];

      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);

      // A non-breaking space is deliberately flattened to a plain space so the
      // decoded text compares cleanly against the verse file.
      if (codePoint === 160) return " ";

      // fromCodePoint (unlike fromCharCode) handles code points above 0xFFFF;
      // out-of-range values would throw, so leave such references as written.
      return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : entity;
    },
  );
}

// Remove every HTML tag (any "<...>" run containing at least one character)
// from the string, keeping the text between tags.
function stripHtml(str) {
  const tagPattern = /<[^>]+>/g;
  return str.replaceAll(tagPattern, "");
}

// Collapse every run of whitespace to a single space and drop leading and
// trailing whitespace.
function normalizeText(str) {
  const words = str.split(/\s+/).filter((word) => word !== "");
  return words.join(" ");
}

// Parse a BibleHub chapter page.
//
// Returns a Map from verse number to an ordered array of { text, red } parts,
// where `red` marks text that was inside a <span class="red">. Only verses
// with at least one red part are included. If the chapter container cannot be
// located, the Map is empty.
function parseChapter(html) {
  const verses = new Map();

  // Isolate the chapter body
  const chapterDiv = html.match(/<div class="chap">([\s\S]*?)<\/div>\s*(?:<div|<A name="fn")/);
  if (!chapterDiv) return verses;

  // Drop headings, cross-references, footnote markers and paragraph openers,
  // applied in this order.
  const cleanupSteps = [
    [/<p class="hdg">[\s\S]*?(?=<p class="reg">)/g, ""],
    [/<span class="cross">[\s\S]*?<\/span>/g, ""],
    [/<span class="fn">[\s\S]*?<\/span>/g, ""],
    [/<A name="\d+">/g, ""],
    [/<p class="indent[^"]*">/g, " "],
    [/<p class="reg">/g, ""],
  ];
  const body = cleanupSteps.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    chapterDiv[1],
  );

  // Each verse starts at a marker shaped like
  // <span class="reftext"><a href="/BOOK/CHAPTER-VERSE.htm"><b>VERSE_NUM</b></a></span>
  const markers = [
    ...body.matchAll(/<span class="reftext"><a href="[^"]+"><b>(\d+)<\/b><\/a><\/span>/g),
  ].map((m) => ({
    verseNum: Number.parseInt(m[1], 10),
    index: m.index,
    afterIndex: m.index + m[0].length,
  }));

  // Markup fragment -> plain, entity-decoded, whitespace-normalised text
  const toPlainText = (fragment) => normalizeText(decodeHtmlEntities(stripHtml(fragment)));

  markers.forEach((marker, position) => {
    // A verse runs from the end of its marker to the start of the next one
    const next = markers[position + 1];
    const segment = body.substring(marker.afterIndex, next ? next.index : body.length);

    const parts = [];
    let cursor = 0;
    for (const redSpan of segment.matchAll(/<span class="red">([\s\S]*?)<\/span>/g)) {
      if (redSpan.index > cursor) {
        const plain = toPlainText(segment.substring(cursor, redSpan.index));
        if (plain) parts.push({ text: plain, red: false });
      }
      const redText = toPlainText(redSpan[1]);
      if (redText) parts.push({ text: redText, red: true });
      cursor = redSpan.index + redSpan[0].length;
    }
    if (cursor < segment.length) {
      const trailing = toPlainText(segment.substring(cursor));
      if (trailing) parts.push({ text: trailing, red: false });
    }

    const hasRed = parts.some((p) => p.red);
    if (hasRed) verses.set(marker.verseNum, parts);
  });

  return verses;
}

// Given verse text from the file and the parsed parts for that verse, return
// the text with <red>...</red> wrapped around the red portions.
//
// verseText - verse text as stored in the file (may start with "¶" and spaces)
// parts     - ordered [{ text, red }] parts, as produced by parseChapter
//
// Behaviour:
//   - No red part at all: the text is returned unchanged.
//   - Every part red: the whole verse is wrapped, keeping a leading "¶" and
//     whitespace outside the tag.
//   - Otherwise each red part is located in turn (exact match, then
//     whitespace-tolerant match, then a match with outer quotes stripped) and
//     wrapped. Parts that cannot be located are skipped.
//
// Red parts are searched for in order, each starting after the previously
// tagged span, so a repeated phrase is never matched inside text that is
// already tagged (which would nest <red> tags).
function applyRedTags(verseText, parts) {
  const OPEN_TAG = "<red>";
  const CLOSE_TAG = "</red>";
  const QUOTE_CHAR = /[\u201C\u201D\u2018\u2019"']/;

  // Nothing to tag. This also covers an empty parts list, for which `every`
  // below would be vacuously true.
  if (!parts.some((p) => p.red)) return verseText;

  // Check if entire verse is red
  if (parts.every((p) => p.red)) {
    // Preserve leading ¶ and whitespace
    const [, leading, content] = verseText.match(/^([¶\s]*)(.*)/s);
    return `${leading}${OPEN_TAG}${content}${CLOSE_TAG}`;
  }

  let result = verseText;
  // Index in `result` just past the most recently inserted closing tag.
  let cursor = 0;

  // Wrap result[start, end) in red tags and move the cursor past the new span.
  const wrap = (start, end) => {
    result =
      result.substring(0, start) +
      OPEN_TAG +
      result.substring(start, end) +
      CLOSE_TAG +
      result.substring(end);
    cursor = end + OPEN_TAG.length + CLOSE_TAG.length;
  };

  for (const part of parts) {
    if (!part.red) continue;

    const redNorm = part.text;
    // An empty string "matches" everywhere and would insert an empty tag pair.
    if (!redNorm) continue;

    // Try exact substring match first
    const exactIdx = result.indexOf(redNorm, cursor);
    if (exactIdx !== -1) {
      wrap(exactIdx, exactIdx + redNorm.length);
      continue;
    }

    // Try matching with flexible whitespace
    const escaped = redNorm.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s+/g, "\\s+");
    const flexMatch = result.slice(cursor).match(new RegExp(escaped));
    if (flexMatch) {
      const start = cursor + flexMatch.index;
      wrap(start, start + flexMatch[0].length);
      continue;
    }

    // Try with quotes stripped from beginning/end for partial quote matching.
    // Sometimes the red span includes an opening quote but the text starts
    // differently. Very short remainders are not attempted.
    const stripped = redNorm.replace(/^[\u201C\u201D\u2018\u2019"']+|[\u201C\u201D\u2018\u2019"']+$/g, "");
    if (stripped.length > 10) {
      const strippedIdx = result.indexOf(stripped, cursor);
      if (strippedIdx !== -1) {
        // Expand to include surrounding quotes if present, without reaching
        // back into the previously tagged span.
        let start = strippedIdx;
        let end = strippedIdx + stripped.length;
        while (start > cursor && QUOTE_CHAR.test(result[start - 1])) start--;
        while (end < result.length && QUOTE_CHAR.test(result[end])) end++;
        wrap(start, end);
      }
    }
    // Otherwise: skip, we can't find the text
  }

  // Merge adjacent red runs (</red><red>, </red> <red>, ...) into one, keeping
  // whatever whitespace separated them so words are not glued together.
  return result.replace(/<\/red>(\s*)<red>/g, "$1");
}

// Download one BSB chapter page from BibleHub.
//
// slug    - book name as used in the BibleHub URL
// chapter - chapter number as used in the BibleHub URL
//
// Resolves to the response body, or to null when the request fails (the
// failure is logged to stderr rather than thrown).
async function fetchChapter(slug, chapter) {
  const url = `https://biblehub.com/bsb/${slug}/${chapter}.htm`;
  const requestOptions = {
    headers: { "User-Agent": "Mozilla/5.0" },
    timeout: 15000,
  };

  try {
    const { data } = await axios.get(url, requestOptions);
    return data;
  } catch (e) {
    console.error(`  Failed to fetch ${url}: ${e.message}`);
    return null;
  }
}

// Return a promise that resolves (with no value) after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

// Rebuild the <red> tags in en_bsb.txt from BibleHub's BSB chapter pages.
//
// Steps:
//   1. Read the verse file and strip every existing <red> tag.
//   2. For each book in BOOKS, fetch each chapter, parse its red-letter parts
//      and re-tag the matching verse lines.
//   3. Write the file back in place and print a summary.
//
// A chapter whose page cannot be fetched keeps the lines it had before this
// run (including any existing <red> tags) instead of being written back
// stripped; such chapters are listed in a warning at the end.
async function main() {
  // Step 1: Read BSB file and strip existing <red> tags
  console.log("Reading en_bsb.txt and stripping existing <red> tags...");
  const originalLines = fs.readFileSync(bsbPath, "utf-8").split("\n");
  const lines = originalLines.map((line) => line.replace(/<\/?red>/g, ""));

  // Index lines by bookIdx:chapterIdx:verseIdx, and remember which line
  // numbers belong to each bookIdx:chapterIdx so a chapter can be restored.
  const lineIndex = new Map();
  const chapterLines = new Map();
  lines.forEach((line, i) => {
    if (!line) return;
    const fields = line.split("|");
    if (fields.length < 7) return;

    lineIndex.set(`${fields[1]}:${fields[2]}:${fields[3]}`, i);

    const chapterKey = `${fields[1]}:${fields[2]}`;
    const bucket = chapterLines.get(chapterKey);
    if (bucket) bucket.push(i);
    else chapterLines.set(chapterKey, [i]);
  });

  let totalTagged = 0;
  let totalPartial = 0;
  let totalFull = 0;
  const failedChapters = [];

  // Step 2: Scrape each book/chapter
  for (const book of BOOKS) {
    console.log(`\nProcessing ${book.slug} (${book.chapters} chapters)...`);
    let bookTagged = 0;

    for (let ch = 1; ch <= book.chapters; ch++) {
      // BibleHub uses 1-based chapters, our file uses 0-based
      const chIdx = ch - 1;
      const html = await fetchChapter(book.slug, ch);

      if (!html) {
        // Without the page we cannot rebuild this chapter's tags, so put back
        // the lines exactly as they were read rather than losing their tags.
        for (const i of chapterLines.get(`${book.idx}:${chIdx}`) ?? []) {
          lines[i] = originalLines[i];
        }
        failedChapters.push(`${book.slug} ${ch}`);
      } else {
        for (const [verseNum, parts] of parseChapter(html)) {
          // BibleHub uses 1-based verses, our file uses 0-based
          const vIdx = verseNum - 1;
          const key = `${book.idx}:${chIdx}:${vIdx}`;
          const lineIdx = lineIndex.get(key);

          if (lineIdx === undefined) {
            console.warn(`  Verse not found: ${book.slug} ${ch}:${verseNum} (key=${key})`);
            continue;
          }

          // The verse text is everything from the 7th field on; it is re-joined
          // in case the text itself contains "|".
          const lineParts = lines[lineIdx].split("|");
          const prefix = lineParts.slice(0, 6).join("|");
          const verseText = lineParts.slice(6).join("|");

          const tagged = applyRedTags(verseText, parts);
          if (tagged !== verseText) {
            lines[lineIdx] = `${prefix}|${tagged}`;
            bookTagged++;
            if (parts.every((p) => p.red)) totalFull++;
            else totalPartial++;
          }
        }
      }

      // Small delay to be polite; applied after every request, including
      // failed ones and chapters that contain no red text.
      await sleep(200);
    }

    console.log(`  ${book.slug}: tagged ${bookTagged} verses`);
    totalTagged += bookTagged;
  }

  // Step 3: Write updated file
  fs.writeFileSync(bsbPath, lines.join("\n"), "utf-8");
  console.log(`\nDone! Tagged ${totalTagged} verses total (${totalFull} full, ${totalPartial} partial)`);

  if (failedChapters.length > 0) {
    console.warn(
      `Could not fetch ${failedChapters.length} chapter(s); their existing lines were kept unchanged: ${failedChapters.join(", ")}`,
    );
  }
}

// Run the scraper. Any failure is logged and turned into a non-zero exit
// status so shells and CI can tell the run did not succeed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
~repos /only-bible-app

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git Discussions: https://groups.google.com/g/rust-embed-devs

The only bible app you will ever need. No ads. No in-app purchases. No distractions.

scripts/scrapeBsbRedLetters.js

GIT_CONFIG_PARAMETERS="'http.version=HTTP/1.1'" git clone https://git.pyrossh.dev/only-bible-app.git
Discussions: https://groups.google.com/g/rust-embed-devs