// STVS content script
// Structure-driven, domain-agnostic article extraction with generic DOM-based recommendation detection.

// Minimum length (in characters) an extracted article text must reach for an
// extraction strategy to be considered successful; shorter results make
// attemptExtraction() move on to its next fallback.
const STVS_MIN_LENGTH = 400;

if (!window.__STVS_CONTENT_INITIALIZED__) {
  // Flag on the page's window object: if this script is evaluated again in the
  // same page, the listener below is not registered a second time.
  window.__STVS_CONTENT_INITIALIZED__ = true;

  // Turns any thrown value into a plain message string for error responses.
  const describeError = (err) => (err && err.message) || String(err);

  // Comparator that orders two nodes by document position (earlier node first).
  const byDocumentOrder = (a, b) => {
    const pos = a.compareDocumentPosition(b);
    if (pos & Node.DOCUMENT_POSITION_FOLLOWING) return -1;
    if (pos & Node.DOCUMENT_POSITION_PRECEDING) return 1;
    return 0;
  };

  /**
   * Message handler for the content script. Every response is sent
   * synchronously through `sendResponse`.
   *
   * Supported `message.type` values:
   *  - "STVS_GET_DOM": responds `{ ok: true, dom }` with the page's outer HTML,
   *    truncated to 500000 characters.
   *  - "STVS_EXTRACT_WITH_SELECTORS": payload `{ rootSelector, paragraphSelector,
   *    excludeSelectors }`; responds `{ ok: true, content }` with the text of the
   *    matching elements in document order, separated by blank lines.
   *  - "STVS_EXTRACT": responds `{ ok: true, data }` built from attemptExtraction().
   *
   * Failures are reported as `{ ok: false, error }`. Unknown or empty messages
   * are ignored (the handler returns false without responding).
   */
  chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (!message) return false;

    if (message.type === "STVS_GET_DOM") {
      try {
        const html = document.documentElement.outerHTML;
        const maxLen = 500000;
        const dom = html.length > maxLen ? html.slice(0, maxLen) + "\n<!-- truncated -->" : html;
        sendResponse({ ok: true, dom });
      } catch (e) {
        sendResponse({ ok: false, error: describeError(e) });
      }
      return true;
    }

    if (message.type === "STVS_EXTRACT_WITH_SELECTORS") {
      const { rootSelector, paragraphSelector, excludeSelectors } = message.payload || {};
      try {
        if (!rootSelector || !paragraphSelector) {
          sendResponse({ ok: false, error: "rootSelector and paragraphSelector required" });
          return true;
        }
        const root = document.querySelector(rootSelector);
        if (!root) {
          sendResponse({ ok: false, error: "Root not found: " + rootSelector });
          return true;
        }

        // Elements whose subtrees must not contribute text.
        const excludeSet = new Set();
        (excludeSelectors || []).forEach(sel => {
          try {
            root.querySelectorAll(sel).forEach(el => excludeSet.add(el));
          } catch (_) {
            // Best effort: an invalid exclude selector is ignored, not fatal.
          }
        });

        // Each comma-separated selector is evaluated on its own so that one
        // invalid part does not discard the others. Matches go into a Set
        // because an element matching several parts must be emitted only once.
        const matched = new Set();
        paragraphSelector
          .split(",")
          .map(s => s.trim())
          .filter(Boolean)
          .forEach(sel => {
            try {
              root.querySelectorAll(sel).forEach(el => matched.add(el));
            } catch (_) {
              // Best effort: an invalid paragraph selector part is ignored.
            }
          });
        const nodes = Array.from(matched).sort(byDocumentOrder);

        const texts = [];
        for (const el of nodes) {
          // Skip the element if it, or any ancestor below root, is excluded.
          let insideExcluded = false;
          let p = el;
          while (p && p !== root) {
            if (excludeSet.has(p)) { insideExcluded = true; break; }
            p = p.parentElement;
          }
          if (insideExcluded) continue;
          const t = (el.textContent || "").trim();
          if (t) texts.push(t);
        }
        const content = texts.join("\n\n").replace(/\n{3,}/g, "\n\n").trim();
        sendResponse({ ok: true, content });
      } catch (e) {
        sendResponse({ ok: false, error: describeError(e) });
      }
      return true;
    }

    if (message.type !== "STVS_EXTRACT") {
      return false;
    }

    try {
      const result = attemptExtraction();
      if (!result || !result.content) {
        sendResponse({ ok: false, error: "No content could be extracted." });
      } else {
        sendResponse({
          ok: true,
          data: {
            title: result.title || document.title || "",
            content: result.content,
            domSummary: result.domSummary || result.content,
            url: window.location.href,
            domain: window.location.hostname,
            publishedDate: result.publishedDate || null,
            modifiedDate: result.modifiedDate || null
          }
        });
      }
    } catch (err) {
      sendResponse({ ok: false, error: describeError(err) });
    }

    return true;
  });
}

/**
 * Extracts the article from the current page, working on a deep clone of the
 * document so the live DOM is never touched.
 *
 * Strategies are tried in order until one yields at least STVS_MIN_LENGTH
 * characters; a later strategy only replaces the current text when it
 * produces a longer one:
 *   1. Structured data (extractFromJsonLd).
 *   2. Reuters-specific extraction (reuters.com hosts only).
 *   3. Structure-driven container selection (findMainArticleContainer).
 *   4. Fallbacks: <article>, <main>, the paragraph-richest <div>, loose
 *      paragraph filtering, dateline ancestor, and finally <body>.
 *
 * @returns {{title: string, content: string, domSummary: string,
 *            publishedDate: *, modifiedDate: *}} The extraction result;
 *   `content` may be shorter than STVS_MIN_LENGTH (even empty) when every
 *   strategy came up short. The date fields come from extractDates().
 */
function attemptExtraction() {
  const cloned = document.cloneNode(true);
  const host =
    typeof window !== "undefined" && window.location && window.location.hostname
      ? window.location.hostname.toLowerCase()
      : "";

  const lengthOf = (text) => (text || "").length;
  const isTooShort = (text) => lengthOf(text) < STVS_MIN_LENGTH;

  // Try JSON-LD (schema.org Article / NewsArticle) first.
  const jsonLdArticle = extractFromJsonLd(cloned);
  if (jsonLdArticle && !isTooShort(jsonLdArticle.content)) {
    // Dates are read from the document here as well, so this path returns the
    // same shape as the DOM-based path below.
    const jsonLdDates = extractDates(cloned) || {};
    return {
      title: jsonLdArticle.title,
      content: jsonLdArticle.content,
      domSummary: jsonLdArticle.content,
      publishedDate: jsonLdDates.publishedDate,
      modifiedDate: jsonLdDates.modifiedDate
    };
  }

  // Reuters-specific: use data-testid="ArticleBody" and extract only paragraph-X + Heading (no <p> tags on Reuters).
  let extracted = "";
  // Last container that supplied text; used only to avoid re-running the
  // <article>/<main> fallbacks on a container that was already tried.
  let mainContainer = null;
  // Exact host or a subdomain, so look-alike hosts such as "notreuters.com" do not match.
  if (host === "reuters.com" || host.endsWith(".reuters.com")) {
    const reutersArticleBody = cloned.querySelector('[data-testid="ArticleBody"]');
    if (reutersArticleBody) {
      extracted = extractTextFromReutersArticleBody(reutersArticleBody);
      if (!isTooShort(extracted)) {
        mainContainer = reutersArticleBody;
      }
    }
    if (isTooShort(extracted)) {
      const reutersRoot = findReutersArticleRoot(cloned);
      if (reutersRoot) {
        const fallback = extractTextFromArticleContainer(reutersRoot);
        if (lengthOf(fallback) > lengthOf(extracted)) {
          extracted = fallback;
          mainContainer = reutersRoot;
        }
      }
    }
  }

  if (isTooShort(extracted)) {
    // Structure-driven container selection. The result replaces the current
    // text only when it is longer, so a partial Reuters extraction is not
    // thrown away for something shorter.
    mainContainer = findMainArticleContainer(cloned);
    if (mainContainer) {
      const structural = extractTextFromArticleContainer(mainContainer);
      if (lengthOf(structural) > lengthOf(extracted)) {
        extracted = structural;
      }
    }
  }

  // If extraction is too short, try fallback strategies.
  if (isTooShort(extracted)) {
    // Fallback 1: Try extracting from <article> directly (highest priority semantic tag).
    const directArticle = cloned.querySelector("article");
    if (directArticle && directArticle !== mainContainer) {
      const fromArticle = extractTextFromArticleContainer(directArticle);
      if (lengthOf(fromArticle) > lengthOf(extracted)) {
        extracted = fromArticle;
        mainContainer = directArticle;
      }
    }

    // Fallback 2: Try <main> tag.
    if (isTooShort(extracted)) {
      const directMain = cloned.querySelector("main");
      if (directMain && directMain !== mainContainer && directMain !== directArticle) {
        const fromMain = extractTextFromArticleContainer(directMain);
        if (lengthOf(fromMain) > lengthOf(extracted)) {
          extracted = fromMain;
        }
      }
    }

    // First <article> or <main> in document order; shared by fallbacks 3 and 4.
    const articleOrMain = cloned.querySelector("article, main");

    // Fallback 3: If still too short, try the div with the most paragraphs within article/main.
    if (isTooShort(extracted) && articleOrMain) {
      let bestDiv = null;
      let maxParagraphs = 0;
      for (const div of articleOrMain.querySelectorAll("div")) {
        const combo = ((div.className || "") + " " + (div.id || "")).toLowerCase();
        // Skip divs with non-content patterns.
        if (/(aside|sidebar|related|recommend|trending|popular|ad-|promo|social|share|comment|footer|header)/.test(combo)) {
          continue;
        }
        const pCount = div.querySelectorAll("p").length;
        if (pCount > maxParagraphs) {
          maxParagraphs = pCount;
          bestDiv = div;
        }
      }
      if (bestDiv && maxParagraphs > 3) {
        const fromDiv = extractTextFromArticleContainer(bestDiv);
        if (lengthOf(fromDiv) > lengthOf(extracted)) {
          extracted = fromDiv;
        }
      }
    }

    // Fallback 4: Aggressive extraction - take ALL paragraphs in article/main, then filter.
    if (isTooShort(extracted) && articleOrMain) {
      const filteredParagraphs = [];

      for (const p of articleOrMain.querySelectorAll("p")) {
        // Check up to 10 ancestors (below article/main) for excluded containers.
        let parent = p.parentElement;
        let isExcluded = false;
        let depth = 0;
        while (parent && depth < 10 && parent !== articleOrMain) {
          const parentCombo = (
            (parent.className || "") +
            " " +
            (parent.id || "") +
            " " +
            (parent.getAttribute("aria-label") || "") +
            " " +
            (parent.tagName || "")
          ).toLowerCase();

          if (/(aside|sidebar|related|recommend|trending|popular|ad-|promo|social|share|comment|footer|header|nav|top\s+stories)/.test(parentCombo) ||
              /^(ASIDE|NAV|HEADER|FOOTER)$/i.test(parent.tagName)) {
            isExcluded = true;
            break;
          }
          parent = parent.parentElement;
          depth++;
        }
        if (isExcluded) continue;

        const text = (p.innerText || p.textContent || "").trim();
        if (text && text.length > 20) {
          filteredParagraphs.push(text);
        }
      }

      if (filteredParagraphs.length > 3) {
        const aggressiveExtracted = normalizeWhitespace(filteredParagraphs.join("\n\n"));
        if (lengthOf(aggressiveExtracted) > lengthOf(extracted)) {
          extracted = aggressiveExtracted;
        }
      }
    }

    // Fallback 5: Dateline-based: find a node containing (Reuters) - or (AP) — and use its container.
    // Skipped when the clone has no <body>: createTreeWalker would throw on a
    // null root and abort the whole extraction.
    if (isTooShort(extracted) && cloned.body) {
      const datelinePattern = /\(Reuters\)\s*[-—]|\(AP\)\s*[—\-]/;
      const walkerDoc = cloned.ownerDocument || cloned;
      const walker = walkerDoc.createTreeWalker(cloned.body, NodeFilter.SHOW_TEXT, null, false);
      let datelineNode = null;
      let n;
      while ((n = walker.nextNode())) {
        if (datelinePattern.test((n.textContent || "").trim())) {
          datelineNode = n;
          break;
        }
      }
      if (datelineNode) {
        // Climb to the nearest ancestor that looks like an article container:
        // an <article>/<main> with at least one <p>, or any element with >= 5 <p>.
        let el = datelineNode.parentElement;
        while (el && el !== cloned.body) {
          const tag = (el.tagName || "").toLowerCase();
          const pCount = el.querySelectorAll("p").length;
          if ((tag === "article" || tag === "main" || pCount >= 5) && pCount > 0) {
            const fromDateline = extractTextFromArticleContainer(el);
            if (lengthOf(fromDateline) > lengthOf(extracted)) {
              extracted = fromDateline;
            }
            break;
          }
          el = el.parentElement;
        }
      }
    }

    // Fallback 6: Last resort - try from body with lenient filtering.
    if (isTooShort(extracted)) {
      const bodyExtracted = extractTextFromArticleContainer(cloned.body);
      if (lengthOf(bodyExtracted) > lengthOf(extracted)) {
        extracted = bodyExtracted;
      }
    }
  }

  const title = inferTitle(cloned);
  const dates = extractDates(cloned);

  return {
    title,
    content: extracted,
    domSummary: extracted,
    publishedDate: dates.publishedDate,
    modifiedDate: dates.modifiedDate
  };
}

// --- JSON-LD helper --------------------------------------------------------

/**
 * Extracts an article title and body from structured data embedded in `doc`.
 *
 * 1. Every `<script type="application/ld+json">` is parsed. Items whose
 *    `@type` contains "article" (case-insensitive) and that carry a string
 *    `articleBody` are candidates; items nested under a top-level `@graph`
 *    array are searched too. The candidate with the longest body wins.
 * 2. If no JSON-LD candidate exists, the Next.js data script is searched
 *    (up to 5 levels deep) for a long `articleBody` / `content` / `text`
 *    string, accepted only when at least STVS_MIN_LENGTH characters long.
 *
 * @param {Document|Element} doc Node to search (must support querySelector/querySelectorAll).
 * @returns {{title: string, content: string}|null} The best candidate, or null when none is found.
 */
function extractFromJsonLd(doc) {
  const scripts = Array.from(
    doc.querySelectorAll('script[type="application/ld+json"]')
  );

  // Flattens a parsed payload (single object or array) into candidate items,
  // including the members of any top-level "@graph" array.
  const collectItems = (json) => {
    const items = [];
    for (const entry of Array.isArray(json) ? json : [json]) {
      if (!entry || typeof entry !== "object") continue;
      items.push(entry);
      if (Array.isArray(entry["@graph"])) {
        items.push(...entry["@graph"]);
      }
    }
    return items;
  };

  // First non-blank string among the given values, or undefined.
  const firstString = (...values) =>
    values.find(v => typeof v === "string" && v.trim() !== "");

  let best = null;

  for (const script of scripts) {
    let json;
    try {
      json = JSON.parse(script.textContent);
    } catch {
      // Malformed JSON-LD is common in the wild; skip this script.
      continue;
    }

    for (const item of collectItems(json)) {
      if (!item || typeof item !== "object") continue;

      const type = item["@type"];
      if (!type) continue;
      const types = Array.isArray(type) ? type : [type];
      const hasArticleType = types.some(t =>
        String(t).toLowerCase().includes("article")
      );
      if (!hasArticleType) continue;

      const body = item.articleBody;
      if (!body || typeof body !== "string") continue;

      const cleanedBody = normalizeWhitespace(body);
      if (!cleanedBody) continue;

      if (!best || cleanedBody.length > best.content.length) {
        // Only string headline/name values are used, so the title is always a string.
        const title = firstString(item.headline, item.name) || inferTitle(doc);
        best = {
          title: title || "",
          content: cleanedBody
        };
      }
    }
  }

  // Fallback: Next.js data script. Its id is "__NEXT_DATA__"; prefer it by id,
  // and otherwise keep the earlier behaviour of taking the first script that
  // matches the legacy selector.
  if (!best) {
    const nextDataScript =
      doc.querySelector('script[id="__NEXT_DATA__"]') ||
      doc.querySelector('script[id="_NEXT_DATA_"], script[type="application/json"]');
    if (nextDataScript && nextDataScript.textContent) {
      try {
        const nextData = JSON.parse(nextDataScript.textContent);

        // Depth-limited search for the first long string stored under
        // articleBody, content or text (checked in that order per object).
        const findArticleBody = (obj, depth = 0) => {
          if (depth > 5) return null; // Bound the recursion on deep payloads.
          if (!obj || typeof obj !== "object") return null;

          for (const field of ["articleBody", "content", "text"]) {
            const val = obj[field];
            if (typeof val === "string" && val.length > 100) {
              return val;
            }
          }

          for (const key in obj) {
            if (typeof obj[key] === "object") {
              const found = findArticleBody(obj[key], depth + 1);
              if (found) return found;
            }
          }
          return null;
        };

        const articleBody = findArticleBody(nextData);
        if (articleBody) {
          const cleanedBody = normalizeWhitespace(articleBody);
          if (cleanedBody && cleanedBody.length >= STVS_MIN_LENGTH) {
            best = {
              title: inferTitle(doc),
              content: cleanedBody
            };
          }
        }
      } catch (e) {
        // The script was not valid JSON; treat as "no structured data".
      }
    }
  }

  return best;
}

// --- Reuters-specific: extract from data-testid="ArticleBody" (paragraph-X, Heading only) ---

/**
 * Collects the text of a Reuters article body element.
 *
 * Only descendants whose data-testid starts with "paragraph-" or equals
 * "Heading" contribute. A descendant is dropped when any ancestor below
 * `root` has a data-testid starting with one of the excluded widget ids.
 *
 * @param {Element|null} root Element carrying the article body.
 * @returns {string} Blocks joined by blank lines and passed through
 *   normalizeWhitespace(); "" when `root` is missing.
 */
function extractTextFromReutersArticleBody(root) {
  if (!root) {
    return "";
  }

  const excludedPrefixes = ["promo-box", "ContextWidget", "primary-gallery"];
  const testIdOf = (node) => (node.getAttribute("data-testid") || "").trim();

  // True when some ancestor strictly between the node and root is an excluded widget.
  const sitsInsideExcludedWidget = (node) => {
    for (let ancestor = node.parentElement; ancestor && ancestor !== root; ancestor = ancestor.parentElement) {
      const ancestorId = testIdOf(ancestor);
      if (excludedPrefixes.some((prefix) => ancestorId.startsWith(prefix))) {
        return true;
      }
    }
    return false;
  };

  const pieces = Array.from(
    root.querySelectorAll('[data-testid^="paragraph-"], [data-testid="Heading"]')
  )
    .filter((node) => testIdOf(node) !== "" && !sitsInsideExcludedWidget(node))
    .map((node) => (node.textContent || node.innerText || "").trim())
    .filter((text) => text !== "");

  return normalizeWhitespace(pieces.join("\n\n"));
}

// --- Reuters-specific: find article root by dateline (fallback when ArticleBody not present) ---

/**
 * Locates a probable article container on a Reuters page from its dateline.
 *
 * The first text node under <body> that contains "(Reuters)" followed by a
 * dash, or that starts with a "City, Month D (Reuters)" dateline, is the
 * anchor. Its ancestors (excluding <body>) are then scanned bottom-up; an
 * ancestor becomes the result when it holds at least 5 <p> elements and
 * strictly more than the current pick.
 *
 * @param {Document} doc Document (or clone) to search.
 * @returns {Element|null} The chosen ancestor, or null when there is no
 *   body, no dateline, or no ancestor with enough paragraphs.
 */
function findReutersArticleRoot(doc) {
  if (!doc || !doc.body) {
    return null;
  }

  // "(Reuters) -" with hyphen, en dash, em dash or horizontal bar.
  const dashDateline = /\(Reuters\)\s*[-\u2013\u2014\u2015]/;
  // "London, May 3 (Reuters)" at the very start of the text.
  const cityDateline = /^[A-Z][A-Za-z]+,\s*[A-Za-z]+\s+\d+\s*\(Reuters\)/;

  const owner = doc.ownerDocument || doc;
  const textWalker = owner.createTreeWalker(doc.body, NodeFilter.SHOW_TEXT, null, false);

  let anchor = null;
  for (let node = textWalker.nextNode(); node; node = textWalker.nextNode()) {
    const trimmed = (node.textContent || "").trim();
    if (trimmed.length === 0) {
      continue;
    }
    if (dashDateline.test(trimmed) || cityDateline.test(trimmed)) {
      anchor = node;
      break;
    }
  }

  if (!anchor || !anchor.parentElement) {
    return null;
  }

  let winner = null;
  let winnerParagraphs = 0;
  for (let current = anchor.parentElement; current && current !== doc.body; current = current.parentElement) {
    const paragraphCount = current.querySelectorAll("p").length;
    if (paragraphCount >= 5 && paragraphCount > winnerParagraphs) {
      winner = current;
      winnerParagraphs = paragraphCount;
    }
  }
  return winner;
}

// --- Main container selection ----------------------------------------------

/**
 * Picks the element most likely to hold the main article text.
 *
 * Candidates are: <article>, <main>, [role='article'], [itemtype*='Article'],
 * known article-body data-testid values, every <section>, and every <div>
 * whose class/id contains an article-like word (article, story, content,
 * post, entry, body) at a word end.
 *
 * A candidate is discarded when its class/id/aria-label contains a
 * non-content keyword, or when it is small (< 5000 characters of text) and
 * its text contains a "related content" phrase such as "most read" or
 * "trending now". The rest are scored:
 *   paragraphs*5 + headings(h1-h3) - figures/figcaptions - forms*3 - lists*2,
 *   +80 when the text contains a Reuters/AP dateline,
 *   -100 when it looks like a small footer block.
 * The highest score wins; on ties the earliest candidate is kept.
 *
 * @param {Document|Element} doc Node to search.
 * @returns {Element|null} Best candidate, or null when none survives.
 */
function findMainArticleContainer(doc) {
  const candidates = new Set();

  doc.querySelectorAll("article, main, [role='article'], [itemtype*='Article']").forEach(el => {
    candidates.add(el);
  });

  // Explicit article-body selectors (Reuters and similar sites).
  doc.querySelectorAll("[data-testid='article-body'], [data-testid='ArticleBody'], [data-testid='article-body-wrapper']").forEach(el => {
    candidates.add(el);
  });

  doc.querySelectorAll("div, section").forEach(el => {
    const tag = (el.tagName || "").toLowerCase();
    // Concatenate before lower-casing so a non-string className cannot throw.
    const combo = ((el.className || "") + " " + (el.id || "")).toLowerCase();
    // <section> is accepted by default (many sites use it for article
    // containers without specific class names). The pattern also covers
    // explicit body names such as "article-body" or "storybody", since each
    // of them ends in "body".
    if (tag === "section" || /(article|story|content|post|entry|body)\b/.test(combo)) {
      candidates.add(el);
    }
  });

  if (!candidates.size) {
    return null;
  }

  // Class/id/aria-label keywords that mark a container as non-content.
  // Note: "section" alone is not rejected - many sites use <section> or
  // class="section" for article containers.
  const nonContentPattern = /(comment|comments|reply|responses|share|social|sidebar|nav|header|footer|cookie|consent|newsletter|signup|promo|related|recommend|suggested|tag|tags|topic|topics|most-read|most-viewed|mostread|mostviewed|ad-|advert|sponsor|trending|popular|stories|related-section|promo-section|ad-section|sidebar-section|footer-section|header-section)/;
  // Phrases that head "related content" widgets. Every phrase takes part in
  // the skip decision below.
  const stopPhrasePattern = /most\s+viewed|most\s+read|more\s+on\s+this\s+story|more\s+from\s+news|related\s+articles|recommended\s+for\s+you|trending\s+now|popular\s+stories/;
  // Dateline pattern: real article body usually starts with (Reuters) - or (AP) — etc.
  const datelinePattern = /\(Reuters\)\s*-|\(AP\)\s*[—\-]|\(Reuters\)\s*—/;
  const footerOnlyPattern = /our\s+standards|thomson\s+reuters\s+trust\s+principles|sign\s+up\s+here|all\s+quotes\s+delayed|all\s+rights\s+reserved/i;

  let best = null;
  let bestScore = -Infinity;

  for (const el of candidates) {
    const combo = ((el.className || "") + " " + (el.id || "") + " " + (el.getAttribute("aria-label") || "")).toLowerCase();
    if (nonContentPattern.test(combo)) {
      continue;
    }

    const textContent = (el.textContent || "").trim();

    // A small container that mentions a stop phrase is most likely the widget
    // itself; large containers are kept because a full article may embed one.
    if (textContent.length < 5000 && stopPhrasePattern.test(textContent.toLowerCase())) {
      continue;
    }

    const paragraphs = el.querySelectorAll("p");
    const headings = el.querySelectorAll("h1, h2, h3");
    const figures = el.querySelectorAll("figure, figcaption");
    const forms = el.querySelectorAll("form");
    const lists = el.querySelectorAll("ul, ol");

    // Penalize containers with many lists (likely navigation or related articles).
    let score =
      paragraphs.length * 5 +
      headings.length * 1 -
      figures.length * 1 -
      forms.length * 3 -
      lists.length * 2;

    // Strong bonus: container contains news dateline (Reuters/AP style) → likely real article body.
    if (datelinePattern.test(textContent)) {
      score += 80;
    }
    // Penalize: looks like footer-only (Our Standards, Trust Principles, Sign up here, etc.) and small.
    if (footerOnlyPattern.test(textContent) && paragraphs.length <= 5 && textContent.length < 3500) {
      score -= 100;
    }

    if (score > bestScore) {
      bestScore = score;
      best = el;
    }
  }

  return best;
}

// --- Text extraction from container ----------------------------------------

function extractTextFromArticleContainer(root) {
  if (!root) return "";

  const doc = root.ownerDocument || document;
  
  // Track if we've seen AP/Reuters/etc. disclaimer (usually marks end of article).
  // Declare once at function scope for use in both extraction paths.
  let seenDisclaimer = false;
  
  // Stop keywords: if we encounter these, we've likely reached the end of the article.
  const STOP_KEYWORDS = [
    /^most\s+viewed$/i,
    /^most\s+read$/i,
    /^more\s+on\s+this\s+story$/i,
    /^more\s+from\s+news$/i,
    /^more\s+from\s+[a-z]+$/i,
    /^related\s+articles?$/i,
    /^recommended\s+for\s+you$/i,
    /^you\s+may\s+also\s+like$/i,
    /^trending\s+now$/i,
    /^popular\s+stories?$/i,
    /^top\s+stories$/i,
    /^related\s+topics?$/i,
    /^see\s+also$/i,
    /^more\s+on\s+this$/i,
    /^for\s+more\s+technology\s+news/i,
    /^for\s+more\s+science/i,
    /^sign\s+up\s+to\s+our/i,
    /^follow\s+us\s+on/i
  ];
  
  // Patterns that indicate we've hit related/summary content (not exact matches, but contained in text).
  const RELATED_CONTENT_PATTERNS = [
    /it's\s+been\s+days\s+since/i,
    /^the\s+mother\s+of\s+the\s+news\s+anchor/i,
    /^police\s+had\s+searched\s+the\s+home/i,
    /^the\s+video\s+was\s+released\s+by\s+the\s+fbi\s+more\s+than/i
  ];
  
  // Patterns that indicate comment/user-generated content (should stop extraction).
  const COMMENT_PATTERNS = [
    /^(it's\s+been\s+my\s+experience|i\s+don't|i\s+don['']t|i\s+have|i\s+think|i\s+believe|i\s+feel|we\s+all|we['']re|anyone\s+else\s+notice|um,\s+anyone)/i,
    /(down\s+votes?|echo\s+chamber|red\s+hat|tds|bot|muted|creeps)/i,
    /(it['']s\s+criminal|not\s+progress|green\s+new\s+deal|green\s+new\s+scam)/i,
    /(good\s+spelling\s+and\s+punctuation|no\s+one\s+wants|so\s+where\s+are\s+they)/i,
    /^(hurting\s+an\s+industry|just\s+the\s+construction|material\s+extraction|carbon\s+footprint)/i,
    /(windmill|windmills|ev['']s|electric\s+vehicles).*devastating/i
  ];
  
  // First, try a more aggressive approach: extract ALL paragraphs from the root,
  // then filter out non-content ones. This works better for complex DOM structures.
  const allParagraphs = root.querySelectorAll("p");
  if (allParagraphs.length > 5) {
    const blocks = [];
    let stopExtraction = false;
    
    for (const p of allParagraphs) {
      if (stopExtraction) break;
      
      // Skip if within excluded containers.
      if (p.closest) {
        const parentCombo = (
          (p.closest("aside, nav, header, footer")?.className || "") +
          " " +
          (p.closest("aside, nav, header, footer")?.id || "") +
          " " +
          (p.closest("aside, nav, header, footer")?.getAttribute("aria-label") || "")
        ).toLowerCase();
        
        if (/(top\s+stories|related|recommend|trending|popular|sidebar|ad-|promo|social|share|comment|footer|header|nav)/.test(parentCombo)) {
          continue;
        }
        
        // Skip if within media containers.
        if (p.closest("figure, figcaption, picture, [data-component*='media'], [data-component*='Video'], [class*='media'], [class*='gallery'], [class*='image'], [class*='photo']")) {
          continue;
        }
      }
      
      let text = (p.innerText || p.textContent || "").trim();
      if (!text || text.length < 10) continue;
      
      // Check if this is an AP/Reuters/etc. disclaimer (marks end of article).
      if (/(AP is solely responsible|Reuters is solely responsible|The Associated Press.*solely responsible|Find AP['']s standards)/i.test(text)) {
        seenDisclaimer = true;
        // Include the disclaimer itself, but mark that we've seen it.
        blocks.push(text);
        continue;
      }
      
      // If we've seen a disclaimer and the next paragraph looks like a comment, stop.
      if (seenDisclaimer) {
        // Check for comment patterns
        let looksLikeComment = false;
        for (const pattern of COMMENT_PATTERNS) {
          if (pattern.test(text)) {
            looksLikeComment = true;
            break;
          }
        }
        // Also check for opinionated short paragraphs
        if (!looksLikeComment && text.length < 200 && (
          /^[A-Z][^.!?]*!+\s*$/.test(text) ||
          /\b(NOT|NEVER|ALWAYS|EVERYONE|NO ONE)\b/.test(text) ||
          /^(hurting|just|um|good\s+spelling)/i.test(text)
        )) {
          looksLikeComment = true;
        }
        if (looksLikeComment) {
          stopExtraction = true;
          break;
        }
      }
      
      // ===== GENERIC DOM-BASED RECOMMENDATION DETECTION =====
      // Check if paragraph is inside an <a> tag (likely a link, not main content).
      if (p.closest("a")) {
        // Skip if it's a link, unless it's a very long paragraph (might be inline link in content).
        if (text.length < 200) {
          continue;
        }
      }
      
      // Check if paragraph is inside a list item (<li>) - likely a recommendation list item.
      const listItem = p.closest("li");
      if (listItem) {
        // Check if the list item contains a link - if so, it's likely a recommendation.
        if (listItem.querySelector("a")) {
          continue;  // Skip recommendation list items
        }
        // If it's a long paragraph in a list item without links, it might be legitimate content.
        // But be cautious - most list items with links are recommendations.
        if (text.length < 150) {
          continue;
        }
      }
      
      // Check if paragraph is inside a container with recommendation-related classes/ids.
      let parent = p.parentElement;
      let depth = 0;
      let isInRecommendationContainer = false;
      while (parent && depth < 5) {
        const parentCombo = (
          (parent.className || "") +
          " " +
          (parent.id || "") +
          " " +
          (parent.getAttribute("aria-label") || "") +
          " " +
          (parent.getAttribute("data-testid") || "") +
          " " +
          (parent.tagName || "")
        ).toLowerCase();
        
        // Check for recommendation-related patterns in parent containers.
        if (/(related|recommend|suggested|more\s+like|you\s+may\s+also|trending|popular|top\s+stories|most\s+viewed|most\s+read)/.test(parentCombo)) {
          isInRecommendationContainer = true;
          break;
        }
        
        // Check if parent is a list (<ul> or <ol>) that contains mostly links.
        if (/^(UL|OL)$/i.test(parent.tagName)) {
          const linksInList = parent.querySelectorAll("a").length;
          const itemsInList = parent.querySelectorAll("li").length;
          // If list has many links relative to items, it's likely a recommendation list.
          if (itemsInList > 0 && linksInList >= itemsInList * 0.7) {
            isInRecommendationContainer = true;
            break;
          }
        }
        
        parent = parent.parentElement;
        depth++;
      }
      
      if (isInRecommendationContainer) {
        continue;  // Skip paragraphs in recommendation containers
      }
      
      // Check for "More like this" or similar embedded recommendation headers.
      // Only skip if it's a short header-like text.
      if (text.length < 50 && /^(more\s+like\s+this|related\s+articles?|you\s+may\s+also\s+like|recommended\s+for\s+you)$/i.test(text)) {
        continue;  // Skip the header itself
      }
      // ===== END GENERIC DOM-BASED DETECTION =====
      
      // Check for stop keywords.
      for (const pattern of STOP_KEYWORDS) {
        if (pattern.test(text)) {
          stopExtraction = true;
          break;
        }
      }
      if (stopExtraction) break;
      
      // Check for related content patterns (summary-style repetitions).
      for (const pattern of RELATED_CONTENT_PATTERNS) {
        if (pattern.test(text)) {
          stopExtraction = true;
          break;
        }
      }
      if (stopExtraction) break;
      
      // Filter out common non-content patterns.
      if (/\((AP|Associated Press|Reuters|Getty Images|AFP|AP Photo|AP\/)[^)]*\)$/.test(text)) {
        continue;
      }
      if (/^FILE\s*-\s*/i.test(text) && /\((AP|Associated Press|Reuters|Getty Images|AFP)/.test(text)) {
        continue;
      }
      if (/^by\s+[A-Z][A-Za-z .'-]+$/i.test(text) && text.length < 100) {
        continue;
      }
      if (/^updated\s+\d{1,2}[:.]\d{2}\s*(am|pm)?/i.test(text)) {
        continue;
      }
      if (/^(share|comment|discuss|follow|subscribe|sign up|log in|register)$/i.test(text)) {
        continue;
      }
      if (text.length < 20 && !/[.!?]/.test(text) && /^(click|read|view|see|more|less|show|hide|expand|collapse)$/i.test(text)) {
        continue;
      }
      if (/^(additional\s+research|research\s+by|reporting\s+by|contributed\s+to\s+this\s+report|additional\s+reporting)/i.test(text) && text.length < 150) {
        continue;
      }
      
      blocks.push(text);
    }
    
    if (blocks.length > 3) {
      return normalizeWhitespace(blocks.join("\n\n"));
    }
  }

  // If the aggressive paragraph extraction above didn't work, fall back to TreeWalker.
  const walker = doc.createTreeWalker(
    root,
    NodeFilter.SHOW_ELEMENT,
    {
      acceptNode(el) {
        const tag = el.tagName;
        if (!tag) return NodeFilter.FILTER_SKIP;

        const combo = (
          (el.className || "") +
          " " +
          (el.id || "") +
          " " +
          (el.getAttribute("aria-label") || "") +
          " " +
          (el.getAttribute("data-testid") || "") +
          " " +
          (el.getAttribute("role") || "")
        ).toLowerCase();

        // Aggressively reject containers with non-content patterns.
        // Note: Don't reject "section" alone - many sites use <section> for article content.
        // Only reject if combined with non-content patterns.
        if (/(comment|comments|reply|responses|discussion|share|social|toolbar|sidebar|nav|breadcrumb|header|footer|cookie|consent|newsletter|signup|promo|related|recommend|suggested|tag|tags|topic|topics|most-read|most-viewed|mostread|mostviewed|ad-|advert|sponsor|byline|author|dateline|credit|caption|media|video|player|gallery|hero|callout|call-to-action|trending|popular|stories|related-section|promo-section|ad-section|sidebar-section|footer-section|header-section)/.test(combo)) {
          return NodeFilter.FILTER_REJECT;
        }

        // Check parent containers too - if any ancestor has these patterns, reject.
        let parent = el.parentElement;
        let depth = 0;
        while (parent && depth < 5) {
          const parentCombo = (
            (parent.className || "") +
            " " +
            (parent.id || "") +
            " " +
            (parent.getAttribute("aria-label") || "") +
            " " +
            (parent.getAttribute("data-testid") || "")
          ).toLowerCase();
          if (/(most-viewed|most-viewed|more-on-this-story|more-from-news|related-articles|recommended|trending|popular-stories|tag|tags|topic|topics)/.test(parentCombo)) {
            return NodeFilter.FILTER_REJECT;
          }
          parent = parent.parentElement;
          depth++;
        }

        if (/^(SCRIPT|STYLE|NOSCRIPT|IFRAME|SVG|CANVAS|VIDEO|AUDIO|FORM|UL|OL|LI|NAV|HEADER|FOOTER)$/i.test(tag)) {
          return NodeFilter.FILTER_REJECT;
        }

        // Handle ASIDE tags: reject if they have non-content aria-labels or classes.
        // But don't reject all <aside> tags - some sites use them for article content.
        if (/^ASIDE$/i.test(tag)) {
          const asideCombo = (
            (el.className || "") +
            " " +
            (el.id || "") +
            " " +
            (el.getAttribute("aria-label") || "")
          ).toLowerCase();
          // Reject aside elements with clear non-content indicators.
          if (/(top\s+stories|related|recommend|trending|popular|sidebar|ad-|promo|social|share|comment)/.test(asideCombo)) {
            return NodeFilter.FILTER_REJECT;
          }
          // Otherwise, skip it (allow traversal into it, but don't accept the aside itself).
          return NodeFilter.FILTER_SKIP;
        }

        // Accept SECTION tags if they don't have non-content patterns in their attributes.
        // Many sites use <section> to wrap article content.
        if (/^SECTION$/i.test(tag)) {
          // Check if this section has non-content patterns - if so, reject it.
          if (/(related|promo|ad-|sidebar|footer|header|nav|comment|share|social|recommend|trending|popular)/.test(combo)) {
            return NodeFilter.FILTER_REJECT;
          }
          // Otherwise, skip it (don't accept the section itself, but allow traversal into it).
          return NodeFilter.FILTER_SKIP;
        }

        // Only treat paragraph-like blocks as candidates.
        if (/^(P|H1|H2|H3|H4|H5|H6|BLOCKQUOTE|PRE)$/i.test(tag)) {
          return NodeFilter.FILTER_ACCEPT;
        }

        return NodeFilter.FILTER_SKIP;
      }
    },
    false
  );

  const blocks = [];
  let node;
  let stopExtraction = false;
  // Reset seenDisclaimer for TreeWalker path (or reuse the same variable from function scope)
  seenDisclaimer = false;

  while ((node = walker.nextNode()) && !stopExtraction) {
    // Skip if within media containers.
    if (
      node.closest &&
      node.closest(
        "figure, figcaption, picture, [data-component*='media'], [data-component*='Video'], [class*='media'], [class*='gallery'], [class*='image'], [class*='photo']"
      )
    ) {
      continue;
    }

    let text = (node.innerText || "").trim();
    if (!text) continue;

    // Check if this is an AP/Reuters/etc. disclaimer (marks end of article).
    if (/(AP is solely responsible|Reuters is solely responsible|The Associated Press.*solely responsible|Find AP['']s standards)/i.test(text)) {
      seenDisclaimer = true;
      // Include the disclaimer itself, but mark that we've seen it.
      blocks.push(text);
      continue;
    }
    
    // If we've seen a disclaimer and the next paragraph looks like a comment, stop.
    if (seenDisclaimer) {
      // Check for comment patterns
      let looksLikeComment = false;
      for (const pattern of COMMENT_PATTERNS) {
        if (pattern.test(text)) {
          looksLikeComment = true;
          break;
        }
      }
      // Also check for opinionated short paragraphs
      if (!looksLikeComment && text.length < 200 && (
        /^[A-Z][^.!?]*!+\s*$/.test(text) ||
        /\b(NOT|NEVER|ALWAYS|EVERYONE|NO ONE)\b/.test(text) ||
        /^(hurting|just|um|good\s+spelling)/i.test(text)
      )) {
        looksLikeComment = true;
      }
      if (looksLikeComment) {
        stopExtraction = true;
        break;
      }
    }

    // ===== GENERIC DOM-BASED RECOMMENDATION DETECTION =====
    // Check if node is inside an <a> tag (likely a link, not main content).
    if (node.closest && node.closest("a")) {
      // Skip if it's a link, unless it's a very long paragraph (might be inline link in content).
      if (text.length < 200) {
        continue;
      }
    }
    
    // Check if node is inside a list item (<li>) - likely a recommendation list item.
    const listItem = node.closest ? node.closest("li") : null;
    if (listItem) {
      // Check if the list item contains a link - if so, it's likely a recommendation.
      if (listItem.querySelector && listItem.querySelector("a")) {
        continue;  // Skip recommendation list items
      }
      // If it's a long paragraph in a list item without links, it might be legitimate content.
      // But be cautious - most list items with links are recommendations.
      if (text.length < 150) {
        continue;
      }
    }
    
    // Check if node is inside a container with recommendation-related classes/ids.
    let parent = node.parentElement;
    let depth = 0;
    let isInRecommendationContainer = false;
    while (parent && depth < 5) {
      const parentCombo = (
        (parent.className || "") +
        " " +
        (parent.id || "") +
        " " +
        (parent.getAttribute("aria-label") || "") +
        " " +
        (parent.getAttribute("data-testid") || "") +
        " " +
        (parent.tagName || "")
      ).toLowerCase();
      
      // Check for recommendation-related patterns in parent containers.
      if (/(related|recommend|suggested|more\s+like|you\s+may\s+also|trending|popular|top\s+stories|most\s+viewed|most\s+read)/.test(parentCombo)) {
        isInRecommendationContainer = true;
        break;
      }
      
      // Check if parent is a list (<ul> or <ol>) that contains mostly links.
      if (/^(UL|OL)$/i.test(parent.tagName)) {
        const linksInList = parent.querySelectorAll ? parent.querySelectorAll("a").length : 0;
        const itemsInList = parent.querySelectorAll ? parent.querySelectorAll("li").length : 0;
        // If list has many links relative to items, it's likely a recommendation list.
        if (itemsInList > 0 && linksInList >= itemsInList * 0.7) {
          isInRecommendationContainer = true;
          break;
        }
      }
      
      parent = parent.parentElement;
      depth++;
    }
    
    if (isInRecommendationContainer) {
      continue;  // Skip nodes in recommendation containers
    }
    
    // Check for "More like this" or similar embedded recommendation headers.
    // Only skip if it's a short header-like text.
    if (text.length < 50 && /^(more\s+like\s+this|related\s+articles?|you\s+may\s+also\s+like|recommended\s+for\s+you)$/i.test(text)) {
      continue;  // Skip the header itself
    }
    // ===== END GENERIC DOM-BASED DETECTION =====

    // Check for final stop keywords (newsletter signups, social media prompts, etc.).
    // These should always stop extraction, even if we were skipping recommendations.
    for (const pattern of STOP_KEYWORDS) {
      if (pattern.test(text)) {
        // Only stop if it's a final stop keyword (not "more like this" which we already handled).
        if (!/^(more\s+like\s+this|related\s+articles?)$/i.test(text)) {
          stopExtraction = true;
          break;
        }
      }
    }
    if (stopExtraction) break;
    
    // Check for related content patterns (summary-style repetitions).
    for (const pattern of RELATED_CONTENT_PATTERNS) {
      if (pattern.test(text)) {
        stopExtraction = true;
        break;
      }
    }
    if (stopExtraction) break;
    
    // Check for comment/user-generated content patterns.
    // If we've already collected substantial content and encounter comment-like text, stop.
    if (blocks.length > 5) {
      for (const pattern of COMMENT_PATTERNS) {
        if (pattern.test(text)) {
          stopExtraction = true;
          break;
        }
      }
      if (stopExtraction) break;
      
      // Also check for paragraphs that look like comments:
      // - Start with exclamation or strong opinion
      // - Contain multiple exclamation marks or all caps words
      // - Very short paragraphs with opinionated language
      if (text.length < 200 && (
        /^[A-Z][^.!?]*!+\s*$/.test(text) ||  // Short sentence ending with exclamation
        /\b(NOT|NEVER|ALWAYS|EVERYONE|NO ONE)\b/.test(text) ||  // All caps opinion words
        /^[A-Z][^.!?]*\?+\s*$/.test(text) && /(anyone|else|notice|where|they)/i.test(text)  // Question about comments
      )) {
        stopExtraction = true;
        break;
      }
    }
    if (stopExtraction) break;
    
    // Check if paragraph contains BBC program names (BBC Click, BBC TechXplore, etc.)
    // These are typically in related content sections at the END of articles.
    if (/BBC\s+(Click|TechXplore|Tech\s+Now)/i.test(text)) {
      // Only stop if we're near the end (have collected substantial content).
      if (blocks.length > 10) {
        stopExtraction = true;
        break;
      }
    }
    
    // Filter out common non-content patterns.
    if (/\((AP|Associated Press|Reuters|Getty Images|AFP|AP Photo|AP\/)[^)]*\)$/.test(text)) {
      continue;
    }
    if (/^FILE\s*-\s*/i.test(text) && /\((AP|Associated Press|Reuters|Getty Images|AFP)/.test(text)) {
      continue;
    }
    if (/^by\s+[A-Z][A-Za-z .'-]+$/i.test(text) && text.length < 100) {
      continue;
    }
    if (/^updated\s+\d{1,2}[:.]\d{2}\s*(am|pm)?/i.test(text)) {
      continue;
    }
    if (/^leer en español$/i.test(text)) {
      continue;
    }
    if (/^(share|comment|discuss|follow|subscribe|sign up|log in|register)$/i.test(text)) {
      continue;
    }
    if (text.length < 20 && !/[.!?]/.test(text) && /^(click|read|view|see|more|less|show|hide|expand|collapse)$/i.test(text)) {
      continue;
    }
    if (/^(additional\s+research|research\s+by|reporting\s+by|contributed\s+to\s+this\s+report|additional\s+reporting)/i.test(text) && text.length < 150) {
      continue;
    }

    blocks.push(text);
  }

  return normalizeWhitespace(blocks.join("\n\n"));
}

/**
 * Infer a human-readable title for the page.
 *
 * Sources are tried in order and the first one that yields non-empty text
 * (after trimming) wins:
 *   1. the first <h1> element's text content
 *   2. the Open Graph title meta tag
 *   3. the Twitter title meta tag
 *   4. the document's own title
 *
 * @param {Document} doc - Document (or document-like object) to inspect.
 * @returns {string} The trimmed title, or "" when nothing usable is found.
 */
function inferTitle(doc) {
  // Trim before testing: an <h1> that only wraps an image or whitespace must
  // not short-circuit the fallbacks with an empty title.
  const h1 = doc.querySelector("h1");
  const h1Text = h1 && h1.textContent ? h1.textContent.trim() : "";
  if (h1Text) {
    return h1Text;
  }

  const ogTitle = doc.querySelector('meta[property="og:title"], meta[name="og:title"]');
  const ogText = ogTitle && ogTitle.content ? ogTitle.content.trim() : "";
  if (ogText) {
    return ogText;
  }

  const twitterTitle = doc.querySelector('meta[name="twitter:title"]');
  const twitterText = twitterTitle && twitterTitle.content ? twitterTitle.content.trim() : "";
  if (twitterText) {
    return twitterText;
  }

  return doc.title || "";
}

/**
 * Extract the published and last-modified dates advertised by a page.
 *
 * Sources are consulted in order; a later source only fills a value that an
 * earlier one left empty:
 *   1. JSON-LD <script> blocks whose "@type" contains "article" (top-level
 *      nodes, array entries, and nodes nested under a top-level "@graph")
 *   2. <meta> tags (article:published_time, itemprop=datePublished, ...)
 *   3. <time> elements carrying a datetime/pubdate attribute
 *
 * Values are returned exactly as found - no parsing or normalization.
 *
 * @param {Document} doc - Document (or document-like object) to inspect.
 * @returns {{publishedDate: (string|null), modifiedDate: (string|null)}}
 */
function extractDates(doc) {
  let publishedDate = null;
  let modifiedDate = null;

  // Try JSON-LD first (most reliable)
  const scripts = Array.from(doc.querySelectorAll('script[type="application/ld+json"]'));
  for (const script of scripts) {
    try {
      const json = JSON.parse(script.textContent);
      const topLevel = Array.isArray(json) ? json : [json];

      // Many CMS plugins wrap every node in {"@context": ..., "@graph": [...]},
      // so the article node is one level down. Each top-level entry is kept
      // ahead of its graph children so previously-matching pages resolve the same.
      const items = [];
      for (const entry of topLevel) {
        items.push(entry);
        if (entry && typeof entry === "object" && Array.isArray(entry["@graph"])) {
          for (const graphNode of entry["@graph"]) {
            items.push(graphNode);
          }
        }
      }

      for (const item of items) {
        if (!item || typeof item !== "object") continue;
        const type = item["@type"];
        if (!type) continue;

        const types = Array.isArray(type) ? type : [type];
        const hasArticleType = types.some(t =>
          String(t).toLowerCase().includes("article")
        );
        if (!hasArticleType) continue;

        if (item.datePublished && !publishedDate) {
          publishedDate = item.datePublished;
        }
        if (item.dateModified && !modifiedDate) {
          modifiedDate = item.dateModified;
        }
      }
    } catch (e) {
      // Best effort: malformed JSON-LD is common in the wild; move on to the
      // next script / fallback source instead of failing the extraction.
    }
  }

  // Try meta tags (Open Graph, Schema.org, etc.)
  if (!publishedDate) {
    const metaPublished = doc.querySelector(
      'meta[property="article:published_time"], ' +
      'meta[name="article:published_time"], ' +
      'meta[property="og:article:published_time"], ' +
      'meta[itemprop="datePublished"], ' +
      'meta[name="date"], ' +
      'meta[name="publishdate"], ' +
      'meta[name="pubdate"]'
    );
    if (metaPublished && metaPublished.content) {
      publishedDate = metaPublished.content;
    }
  }

  if (!modifiedDate) {
    const metaModified = doc.querySelector(
      'meta[property="article:modified_time"], ' +
      'meta[name="article:modified_time"], ' +
      'meta[property="og:article:modified_time"], ' +
      'meta[itemprop="dateModified"], ' +
      'meta[name="modified"], ' +
      'meta[name="moddate"]'
    );
    if (metaModified && metaModified.content) {
      modifiedDate = metaModified.content;
    }
  }

  // Try <time> elements
  if (!publishedDate || !modifiedDate) {
    const timeElements = doc.querySelectorAll('time[datetime], time[pubdate]');
    for (const timeEl of timeElements) {
      const datetime = timeEl.getAttribute("datetime") || timeEl.getAttribute("pubdate");
      if (!datetime) continue;

      // `pubdate` is a boolean attribute: <time pubdate datetime="..."> reports
      // "" from getAttribute, so presence has to be tested with hasAttribute.
      const marksPublication = timeEl.hasAttribute("pubdate") || timeEl.closest("article");
      if (!publishedDate && marksPublication) {
        publishedDate = datetime;
      }
      if (!modifiedDate && timeEl.getAttribute("itemprop") === "dateModified") {
        modifiedDate = datetime;
      }
    }
  }

  // Return dates in their original format - no conversion or normalization
  return {
    publishedDate: publishedDate || null,
    modifiedDate: modifiedDate || null
  };
}

/**
 * Normalize whitespace in extracted text.
 *
 * - CRLF / CR line endings become LF
 * - runs of spaces and tabs collapse to a single space
 * - leading spaces/tabs are removed from every line
 * - three or more consecutive newlines collapse to one blank line
 * - the result is trimmed
 *
 * @param {string} text - Raw text; any falsy value yields "".
 * @returns {string} The normalized text.
 */
function normalizeWhitespace(text) {
  if (!text) return "";
  return text
    .replace(/\r\n/g, "\n")
    .replace(/\r/g, "\n")
    .replace(/[ \t]+/g, " ")
    // Strip line-leading whitespace BEFORE collapsing newlines: whitespace-only
    // lines must become empty first, otherwise "\n \n \n" slips past the
    // collapse and is left behind as three consecutive newlines.
    .replace(/^[ \t]+/gm, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
