Coverage for apps / recipes / services / search_parsers.py: 79%
183 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-12 10:49 +0000
1"""
2HTML/URL parsing helpers for recipe search.
4Extracted from RecipeSearch to keep search.py focused on orchestration.
5All functions are module-level (no class needed).
6"""
import logging
import re
from typing import Callable, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from apps.recipes.services.search import SearchResult
17logger = logging.getLogger(__name__)
def find_link(element) -> Optional[tuple]:
    """Locate a recipe link inside *element*.

    Looks for a nested ``<a href=...>`` first; failing that, checks
    whether the element itself is an anchor carrying an href.

    Returns:
        Tuple of (link_element, url) if found, None otherwise.
    """
    anchor = element.find("a", href=True)
    if not anchor and element.name == "a" and element.get("href"):
        anchor = element
    if not anchor:
        return None

    href = anchor.get("href", "")
    return (anchor, href) if href else None
def extract_title(element, link) -> str:
    """Extract title from element with multiple fallback strategies.

    Tries: heading elements, class-based title elements, link text,
    then title/aria-label attributes.
    """
    # Bug fix: ".title" and '[class*="title"]' were previously passed to
    # find(), which interprets list entries as literal tag names, so they
    # could never match. Use select_one() so the CSS selector works.
    # '[class*="title"]' also covers class="title".
    title_el = element.find(["h2", "h3", "h4"]) or element.select_one('[class*="title"]')
    if title_el:
        title = title_el.get_text(strip=True)
        if title:
            return title

    title = link.get_text(strip=True)
    if title:
        return title

    return link.get("title", "") or link.get("aria-label", "")
def extract_rating(title: str) -> tuple[str, Optional[int]]:
    """Split a trailing rating count off *title*.

    Handles patterns like "Recipe Name1,392Ratings".

    Returns:
        Tuple of (cleaned_title, rating_count); rating_count is None
        when no trailing rating is present.
    """
    match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
    if match is None:
        return title, None

    digits = match.group(1).replace(",", "")
    try:
        count = int(digits)
    except ValueError:  # matched group was all commas, e.g. ",Ratings"
        return title, None
    return title[: match.start()].strip(), count
def extract_description(element) -> str:
    """Extract a short (max 200 chars) description from element."""
    # Bug fix: ".description" and '[class*="description"]' were previously
    # passed to find(), which treats list entries as literal tag names, so
    # they could never match. Use select_one() so the CSS selector works.
    desc_el = element.find("p") or element.select_one('[class*="description"]')
    if desc_el:
        return desc_el.get_text(strip=True)[:200]
    return ""
def parse_srcset(srcset: str) -> list[tuple[str, int]]:
    """Parse a srcset attribute value into (url, width) pairs.

    Handles the "url1 100w, url2 200w" format. URLs may themselves
    contain commas (e.g. resize=93,84), so matching is anchored on the
    "<n>w" width descriptor instead of splitting on commas.
    """
    return [
        (m.group(1), int(m.group(2)))
        for m in re.finditer(r"(https?://\S+?)\s+(\d+)w", srcset)
    ]
def _collect_srcset_strings(element) -> list[str]:
    """Gather non-WebP srcset strings from <picture><source> and <img>."""
    collected: list[str] = []
    picture = element.find("picture")
    if picture:
        for source in picture.find_all("source"):
            mime = (source.get("type") or "").lower()
            candidate = source.get("srcset", "")
            if candidate and "webp" not in mime:
                collected.append(candidate)
    img = element.find("img")
    if img:
        img_srcset = img.get("srcset", "")
        if img_srcset:
            collected.append(img_srcset)
    return collected
def best_url_from_srcset(element, base_url: str) -> str:
    """Return the widest non-WebP image URL found in srcset attributes."""
    candidates: list[tuple[str, int]] = []
    for srcset in _collect_srcset_strings(element):
        candidates.extend(parse_srcset(srcset))
    if not candidates:
        return ""
    widest = max(candidates, key=lambda pair: pair[1])
    return urljoin(base_url, widest[0])
def extract_image(element, base_url: str) -> str:
    """Extract an image URL with multiple fallback strategies.

    Prefers the largest srcset candidate, then falls back to the first
    <img>'s src / data-src / data-lazy-src attributes. Handles
    <picture><source srcset> patterns used by modern sites.
    """
    # srcset usually offers something larger than a thumbnail
    best = best_url_from_srcset(element, base_url)
    if best:
        return best

    img = element.find("img")
    if not img:
        return ""

    for attr in ("src", "data-src", "data-lazy-src"):
        candidate = img.get(attr)
        if candidate:
            return urljoin(base_url, candidate)
    return ""
def extract_result_from_element(
    element,
    host: str,
    base_url: str,
) -> Optional[SearchResult]:
    """Build a SearchResult from an HTML element, or None if filtered out."""
    found = find_link(element)
    if found is None:
        return None
    link, href = found

    # Resolve to an absolute URL and classify it by signal strength
    absolute_url = urljoin(base_url, href)
    signal = get_url_signal(absolute_url, host)
    if signal in ("strong_exclude", "reject"):
        return None

    raw_title = extract_title(element, link)
    if not raw_title:
        return None

    # Strip any trailing "N Ratings" suffix from the title
    title, rating_count = extract_rating(raw_title)
    # Title may have become empty after stripping rating (QA-053)
    if not title:
        return None

    # Filter non-recipe content by title (012-filter-search-results)
    if not looks_like_recipe_title(title, signal):
        logger.debug("Filtered non-recipe title: %s (%s)", title, absolute_url)
        return None

    image_url = extract_image(element, base_url)
    description = extract_description(element)

    # Field validation: neutral URL results must have both image AND description.
    # Real recipe cards from search pages almost always have both.
    # Editorial/article results often lack one or both.
    if signal == "neutral" and not (image_url and description):
        logger.debug("Filtered neutral URL missing image or description: %s (%s)", title, absolute_url)
        return None

    return SearchResult(
        url=absolute_url,
        title=title[:200],
        host=host,
        image_url=image_url,
        description=description,
        rating_count=rating_count,
    )
def _parse_articles(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from <article> elements."""
    results = []
    for el in soup.find_all("article")[:30]:
        result = extract_result_from_element(el, host, base_url)
        if result:
            results.append(result)
    return results
def _parse_cards(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from card-like div elements.

    Tries selectors from most to least specific; returns the first
    selector's results that yield anything.
    """
    selectors = (
        '[class*="recipe-card"]',
        '[class*="card"]',
        '[class*="result"]',
        '[class*="item"]',
    )
    for selector in selectors:
        found = []
        for el in soup.select(selector)[:30]:
            result = extract_result_from_element(el, host, base_url)
            if result:
                found.append(result)
        if found:
            return found
    return []
def _parse_links(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from links that look like recipe URLs."""
    collected = []
    for anchor in soup.find_all("a", href=True)[:100]:
        candidate_url = urljoin(base_url, anchor.get("href", ""))
        signal = get_url_signal(candidate_url, host)
        if signal not in ("strong_include", "neutral"):
            continue
        text = anchor.get_text(strip=True)
        if not text or len(text) <= 5:
            continue
        if looks_like_recipe_title(text, signal):
            collected.append(SearchResult(url=candidate_url, title=text[:200], host=host))
    return collected
def fallback_parse(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Fallback parser for sites without a specific selector.

    Tries article elements, card-like divs, then bare recipe links,
    returning the first strategy that yields any results.
    """
    for strategy in (_parse_articles, _parse_cards, _parse_links):
        parsed = strategy(soup, host, base_url)
        if parsed:
            return parsed
    return []
# Compiled patterns for looks_like_recipe_url (avoid recompiling per call)
# Paths matching any of these are treated as likely recipe detail pages.
_RECIPE_PATTERNS = [
    re.compile(p)
    for p in [
        r"/recipe[s]?/",
        r"/dish/",
        r"/food/",
        r"/cooking/",
        r"/\d+/",  # numeric content-ID path segment
        r"-recipe/?$",  # slug ending in "-recipe"
        r"/a\d+/",  # short alphanumeric ID segment, e.g. /a12345/
        r"/food-cooking/",
    ]
]

# Paths matching any of these are never treated as recipe detail pages.
# get_url_signal checks these BEFORE the recipe patterns, so exclusion wins.
_EXCLUDE_PATTERNS = [
    re.compile(p)
    for p in [
        # Search/taxonomy and account/meta pages
        r"/search",
        r"/tag/",
        r"/category/",
        r"/author/",
        r"/profile/",
        r"/user/",
        r"/about",
        r"/contact",
        r"/privacy",
        r"/terms",
        r"/newsletter",
        r"/subscribe",
        # Article/blog paths (QA-053)
        r"/article/",
        r"/articles/",
        r"/blog/",
        r"/post/",
        r"/posts/",
        r"/news/",
        r"/story/",
        r"/stories/",
        r"/feature/",
        r"/features/",
        r"/guide/",
        r"/guides/",
        r"/review/",
        r"/reviews/",
        r"/roundup/",
        r"/list/",
        r"/listicle/",
        # Video paths (QA-053)
        r"/video/",
        r"/videos/",
        r"/watch/",
        r"/watch\?",
        r"/embed/",
        r"/player/",
        r"/clip/",
        r"/clips/",
        r"/episode/",
        r"/episodes/",
        r"/series/",
        r"/show/",
        r"/shows/",
        r"/gallery/",
        r"/galleries/",
        r"/slideshow/",
        r"/photo-gallery/",
        # Index/listing pages (QA-053); (?:/|$) matches the segment whether
        # it ends the path or is followed by more segments
        r"/seasons?(?:/|$)",
        r"/cuisines?(?:/|$)",
        r"/ingredients?(?:/|$)",
        r"/collections?(?:/|$)",
        r"/occasions?(?:/|$)",
        r"/courses?(?:/|$)",
        r"/diets?(?:/|$)",
        r"/techniques?(?:/|$)",
        r"/chefs?(?:/|$)",
        r"/dishes(?:/|$)",
        r"/menus?(?:/|$)",
        r"/meal-plans?(?:/|$)",
    ]
]
def _check_exclusion_patterns(path: str) -> bool:
    """Return True if path matches any exclusion pattern."""
    for pattern in _EXCLUDE_PATTERNS:
        if pattern.search(path):
            return True
    return False
def _check_recipe_patterns(path: str) -> bool:
    """Return True if path matches any recipe pattern."""
    for pattern in _RECIPE_PATTERNS:
        if pattern.search(path):
            return True
    return False
353# Site-specific rules: host → callable returning signal or None
354_SITE_RULES: dict[str, callable] = {
355 "allrecipes.com": lambda path: "reject" if "/recipe/" not in path else None,
356}
def _check_site_rules(host: str, path: str) -> Optional[str]:
    """Apply site-specific rules. Returns a signal string or None.

    A rule fires when *host* is the rule's domain or a subdomain of it.
    """
    for domain, rule in _SITE_RULES.items():
        is_match = host == domain or host.endswith(f".{domain}")
        if is_match:
            return rule(path)
    return None
367def _check_path_heuristics(path: str) -> str:
368 """Apply heuristic fallbacks for paths with no strong signal.
370 Returns "neutral" or "reject".
371 """
372 segments = [s for s in path.split("/") if s]
373 if len(segments) >= 2 and len(path) > 20:
374 return "neutral"
375 if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2:
376 return "neutral"
377 return "reject"
def get_url_signal(url: str, host: str) -> str:
    """Determine URL signal strength for recipe filtering.

    Returns:
        "strong_exclude" - URL matches exclusion patterns (articles, blogs, etc.)
        "strong_include" - URL matches recipe patterns (/recipe/, /recipes/, etc.)
        "neutral" - URL passes heuristics but has no strong signal
        "reject" - URL fails all checks (wrong host, too short, etc.)
    """
    parsed = urlparse(url)

    # Bug fix: the previous substring test (`host in parsed.netloc`) accepted
    # unrelated hosts such as "evil-allrecipes.com.attacker.net" and was
    # case- and port-sensitive. parsed.hostname is lowercased with any port
    # stripped; accept only the exact host or one of its subdomains.
    url_host = (parsed.hostname or "").lower()
    expected = host.lower()
    if url_host != expected and not url_host.endswith(f".{expected}"):
        return "reject"

    path = parsed.path.lower()

    # Exclusion patterns win over everything else
    if _check_exclusion_patterns(path):
        return "strong_exclude"

    # Site-specific rules override the generic include patterns
    site_signal = _check_site_rules(host, path)
    if site_signal is not None:
        return site_signal

    if _check_recipe_patterns(path):
        return "strong_include"

    return _check_path_heuristics(path)
def looks_like_recipe_url(url: str, host: str) -> bool:
    """Check if a URL looks like a recipe detail page."""
    return get_url_signal(url, host) in ("strong_include", "neutral")
# Strong editorial patterns — always reject even if recipe words present
# These are clearly article/editorial headlines, not recipe titles
_STRONG_EDITORIAL_PATTERNS = [
    re.compile(p, re.IGNORECASE)
    for p in [
        r"\bdeserves?\s+a\s+(?:gold|silver|bronze)\s+medal\b",
        r"\bis\s+a\s+weeknight\s+winner\b",
        r"\btop\s+trending\s+recipe\s+of\s+\d{4}\b",
        r"\binsanely\s+awesome\b",
        r"\bmost\s+beautiful\s+destination\b",
        r"\bbest\s+time\s+to\s+book\b",
    ]
]

# Mild editorial patterns — rejected unless recipe-context words present
# (resolution logic lives in looks_like_recipe_title)
_EDITORIAL_TITLE_PATTERNS = [
    re.compile(p, re.IGNORECASE)
    for p in [
        # Listicles: "Top 10...", "5 Best...", "7 Reasons..."
        r"^(?:the\s+)?(?:top\s+)?\d+\s+(?:best|worst|things|reasons|ways|places|tips|tricks|destinations|restaurants|spots|cities|towns)\b",
        # Travel/destination content
        r"\btravel\s+guide\b",
        r"\bbest\s+destinations?\b",
        r"\bplaces?\s+to\s+visit\b",
        r"\bwhere\s+to\s+(?:eat|go|stay|travel)\b",
        r"\bbook\s+(?:your\s+)?(?:thanksgiving|christmas|holiday)\s+travel\b",
        # Review/editorial
        r"^review\s*:",
        r"\b(?:product|book|restaurant|movie|hotel|app)\s+review\b",
        # News/trending headers
        r"^(?:news|breaking|update|trending)\s*:",
        # Meta/navigation pages
        r"^(?:about\s+us|contact\s+us|privacy\s+policy|terms\s+of|cookie\s+policy|subscribe|newsletter|sign\s+up|log\s+in)\b",
    ]
]

# Recipe-context words that override mild editorial title patterns
# NOTE(review): "step.by.step" uses "." (any single char), so it also
# matches e.g. "stepXbyXstep" — presumably "-" or space was intended; confirm.
_RECIPE_CONTEXT_PATTERN = re.compile(
    r"\b(?:recipe[s]?|cook(?:ing|ed)?|bake[ds]?|baking|roast(?:ed|ing)?|"
    r"grill(?:ed|ing)?|how\s+to\s+(?:make|cook|bake|prepare)|homemade|"
    r"ingredient[s]?|from\s+scratch|step.by.step|easy\s+(?:to\s+)?make)\b",
    re.IGNORECASE,
)
def looks_like_recipe_title(title: str, url_signal: str) -> bool:
    """Check if a search result title looks like recipe content.

    Tiered resolution using URL signal strength:
    - strong_include URLs always pass (URL evidence beats title doubts)
    - neutral URLs are evaluated by title patterns
    - Strong editorial patterns always reject (even with recipe words)
    - Mild editorial patterns reject unless recipe-context words appear
    """
    if url_signal == "strong_include":
        return True

    text = title.strip()
    if not text:
        return False

    # Strong editorial patterns always reject
    if any(pattern.search(text) for pattern in _STRONG_EDITORIAL_PATTERNS):
        return False

    # Mild editorial patterns: recipe-context words can rescue the title
    if any(pattern.search(text) for pattern in _EDITORIAL_TITLE_PATTERNS):
        return bool(_RECIPE_CONTEXT_PATTERN.search(text))

    return True