Coverage for apps / recipes / services / search_parsers.py: 72%
134 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 13:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 13:22 +0000
1"""
2HTML/URL parsing helpers for recipe search.
4Extracted from RecipeSearch to keep search.py focused on orchestration.
5All functions are module-level (no class needed).
6"""
8import logging
9import re
10from typing import Optional
11from urllib.parse import urljoin
13from bs4 import BeautifulSoup
15from apps.recipes.services.search import SearchResult
16from apps.recipes.services.search_classifiers import (
17 get_url_signal,
18 looks_like_recipe_title,
19)
21logger = logging.getLogger(__name__)
def find_link(element) -> Optional[tuple]:
    """Locate the anchor that carries a result's URL.

    The first descendant ``<a>`` that has an ``href`` attribute is preferred.
    When there is none, ``element`` itself is used, provided it is an ``<a>``
    with a non-empty ``href``.

    Returns:
        ``(anchor, href)`` when an anchor with a non-empty ``href`` exists,
        ``None`` otherwise.
    """
    anchor = element.find("a", href=True)
    if not anchor and element.name == "a" and element.get("href"):
        # No nested anchor: the element may itself be the link.
        anchor = element
    if not anchor:
        return None

    # href=True also matches an empty attribute, so re-check the value.
    href = anchor.get("href", "")
    return (anchor, href) if href else None
def extract_title(element, link) -> str:
    """Extract a result title, trying several sources in order.

    Sources, first non-empty wins:

    1. The first ``<h2>``/``<h3>``/``<h4>`` inside ``element``.
    2. If there is no such heading, the first descendant matching the CSS
       selectors ``.title`` or ``[class*="title"]``.
    3. The visible text of ``link``.
    4. The ``title`` attribute of ``link``, then its ``aria-label``.

    Text is read with ``separator=" "`` so nested metadata spans
    (e.g. prep-time) don't bleed into the title without whitespace, and
    sources 1-3 are passed through ``_strip_title_metadata``.

    Args:
        element: Container element of the search result.
        link: Anchor element for the result.

    Returns:
        The extracted title, or ``""`` if no source yields any text.
    """
    title_el = element.find(["h2", "h3", "h4"])
    if title_el is None:
        # find() treats list entries as tag names, so CSS selectors passed to
        # it never match; class-based lookups have to go through select_one().
        title_el = element.select_one('.title, [class*="title"]')

    if title_el is not None:
        title = title_el.get_text(separator=" ", strip=True)
        if title:
            return _strip_title_metadata(title)

    title = link.get_text(separator=" ", strip=True)
    if title:
        return _strip_title_metadata(title)

    return link.get("title", "") or link.get("aria-label", "")
# Matches a duration suffix at the end of a card title,
# e.g. "30 mins" or "1 hr 30 mins" (case-insensitive).
_TRAILING_TIME_RE = re.compile(
    r"\s+\d+\s*(?:hr?s?|hour[s]?|min(?:ute)?[s]?)(?:\s+\d+\s*(?:min(?:ute)?[s]?))?$",
    flags=re.IGNORECASE,
)


def _strip_title_metadata(title: str) -> str:
    """Return ``title`` without a trailing time/duration suffix, trimmed."""
    without_time = _TRAILING_TIME_RE.sub("", title)
    return without_time.strip()
def extract_rating(title: str) -> tuple[str, Optional[int]]:
    """Split a trailing rating count off a title.

    Recognizes suffixes such as "Recipe Name1,392Ratings", where the count
    (digits with optional commas) is directly followed by "Rating(s)".

    Returns:
        ``(cleaned_title, rating_count)``. When no rating suffix is present,
        or the matched count is not a valid integer, the title is returned
        unchanged with ``None`` as the count.
    """
    match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
    if match is None:
        return title, None

    digits = match.group(1).replace(",", "")
    try:
        count = int(digits)
    except ValueError:
        # The group can consist solely of commas, leaving nothing to parse.
        return title, None

    return title[: match.start()].strip(), count
def extract_description(element) -> str:
    """Extract a short description from a result element.

    The first ``<p>`` inside ``element`` is used. If there is none, the first
    descendant matching the CSS selectors ``.description`` or
    ``[class*="description"]`` is used instead.

    Returns:
        The element's stripped text truncated to 200 characters, or ``""``
        when no description element is found.
    """
    desc_el = element.find("p")
    if desc_el is None:
        # find() treats list entries as tag names, so CSS selectors passed to
        # it never match; class-based lookups have to go through select_one().
        desc_el = element.select_one('.description, [class*="description"]')
    if desc_el is None:
        return ""
    return desc_el.get_text(strip=True)[:200]
# One srcset candidate: a URL token, whitespace, a width descriptor ("200w"),
# then the separating comma or the end of the string.
_SRCSET_CANDIDATE_RE = re.compile(r"(\S+)\s+(\d+)w\s*(?:,|$)")


def parse_srcset(srcset: str) -> list[tuple[str, int]]:
    """Parse a srcset string into (url, width) pairs.

    Handles the width-descriptor form: "url1 100w, url2 200w".
    URLs may contain commas (e.g. resize=93,84), so candidates are located
    via the width descriptor rather than by splitting on commas.

    Absolute, protocol-relative and relative URLs are all accepted; URLs are
    returned as written, and resolving them against a base URL is left to
    the caller. Candidates without a width descriptor (e.g. "2x") are
    skipped.

    Args:
        srcset: Raw value of a ``srcset`` attribute.

    Returns:
        ``(url, width)`` tuples in the order they appear; empty when nothing
        matches.
    """
    return [
        (candidate.group(1), int(candidate.group(2)))
        for candidate in _SRCSET_CANDIDATE_RE.finditer(srcset)
    ]
def _collect_srcset_strings(element) -> list[str]:
    """Gather srcset attribute values from ``<picture><source>`` and ``<img>``.

    ``<source>`` entries whose ``type`` attribute mentions "webp" are skipped.
    The srcset of the first ``<img>`` is appended last, without any type
    filtering.
    """
    collected = []

    picture_el = element.find("picture")
    sources = picture_el.find_all("source") if picture_el else []
    for source in sources:
        candidate = source.get("srcset", "")
        if not candidate:
            continue
        mime = (source.get("type") or "").lower()
        if "webp" in mime:
            continue
        collected.append(candidate)

    img_el = element.find("img")
    if img_el and img_el.get("srcset", ""):
        collected.append(img_el["srcset"])

    return collected
def best_url_from_srcset(element, base_url: str) -> str:
    """Return the widest srcset image URL found in ``element``.

    Candidates come from ``_collect_srcset_strings`` (which skips sources
    typed as WebP) and are parsed with ``parse_srcset``. The winner is
    resolved against ``base_url``; ``""`` is returned when there are no
    candidates.
    """
    candidates = []
    for srcset in _collect_srcset_strings(element):
        candidates.extend(parse_srcset(srcset))

    if not candidates:
        return ""

    widest = max(candidates, key=lambda pair: pair[1])
    return urljoin(base_url, widest[0])
def extract_image(element, base_url: str) -> str:
    """Pick an image URL for a result, resolved against ``base_url``.

    The largest srcset candidate (including ``<picture><source srcset>``)
    wins. Otherwise the first ``<img>`` is inspected and the first non-empty
    of ``src``, ``data-src`` and ``data-lazy-src`` is used. Returns ``""``
    when nothing is found.
    """
    # srcset usually offers something larger than the thumbnail in src.
    from_srcset = best_url_from_srcset(element, base_url)
    if from_srcset:
        return from_srcset

    img_tag = element.find("img")
    if not img_tag:
        return ""

    # Plain src first, then the common lazy-loading attributes.
    for attr in ("src", "data-src", "data-lazy-src"):
        value = img_tag.get(attr)
        if value:
            return urljoin(base_url, value)
    return ""
def extract_result_from_element(
    element,
    host: str,
    base_url: str,
) -> Optional[SearchResult]:
    """Build a ``SearchResult`` from one HTML element, or reject it.

    Returns ``None`` when the element has no usable link, its URL signal is
    "strong_exclude" or "reject", no title survives cleaning, the title does
    not look like a recipe, or a "neutral" URL lacks an image or description.
    """
    found = find_link(element)
    if not found:
        return None
    anchor, href = found

    # Resolve against the page URL before classifying the link.
    absolute_url = urljoin(base_url, href)
    signal = get_url_signal(absolute_url, host)
    if signal in ("strong_exclude", "reject"):
        return None

    raw_title = extract_title(element, anchor)
    if not raw_title:
        return None

    # Stripping the rating suffix can leave an empty title (QA-053).
    clean_title, ratings = extract_rating(raw_title)
    if not clean_title:
        return None

    # Filter non-recipe content by title (012-filter-search-results).
    if not looks_like_recipe_title(clean_title, signal):
        logger.debug("Filtered non-recipe title: %s (%s)", clean_title, absolute_url)
        return None

    image = extract_image(element, base_url)
    summary = extract_description(element)

    # Neutral URLs need both image AND description: recipe cards almost
    # always have them, editorial/article results often lack one or both.
    has_both = bool(image and summary)
    if signal == "neutral" and not has_both:
        logger.debug(
            "Filtered neutral URL missing image or description: %s (%s)",
            clean_title,
            absolute_url,
        )
        return None

    return SearchResult(
        url=absolute_url,
        title=clean_title[:200],
        host=host,
        image_url=image,
        description=summary,
        rating_count=ratings,
    )
def _parse_articles(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: build results from the first 30 ``<article>`` elements."""
    found = []
    for article in soup.find_all("article")[:30]:
        result = extract_result_from_element(article, host, base_url)
        if result:
            found.append(result)
    return found
def _parse_cards(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: build results from card-like elements.

    Selectors are tried from most to least specific. The first selector that
    yields at least one result (from its first 30 matches) decides the
    outcome; later selectors are not consulted.
    """
    selectors = (
        '[class*="recipe-card"]',
        '[class*="card"]',
        '[class*="result"]',
        '[class*="item"]',
    )
    for selector in selectors:
        matched = []
        for node in soup.select(selector)[:30]:
            result = extract_result_from_element(node, host, base_url)
            if result:
                matched.append(result)
        if matched:
            return matched
    return []
def _parse_links(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: build results from bare links that look like recipe URLs.

    Only the first 100 anchors with an ``href`` are examined. A link is kept
    when its URL signal is "strong_include" or "neutral", its text is longer
    than 5 characters, and that text passes ``looks_like_recipe_title``.
    Results carry only url, title and host.
    """
    collected = []
    anchors = soup.find_all("a", href=True)[:100]
    for anchor in anchors:
        target = urljoin(base_url, anchor.get("href", ""))
        signal = get_url_signal(target, host)
        if signal not in ("strong_include", "neutral"):
            continue

        text = anchor.get_text(strip=True)
        if not text or len(text) <= 5:
            continue
        if looks_like_recipe_title(text, signal):
            collected.append(SearchResult(url=target, title=text[:200], host=host))
    return collected
def fallback_parse(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Generic parser for sites without a dedicated selector.

    Runs the article, card and bare-link strategies in that order and returns
    the output of the first one that produces any results, or an empty list
    if none does.
    """
    strategies = (_parse_articles, _parse_cards, _parse_links)
    for parse in strategies:
        if found := parse(soup, host, base_url):
            return found
    return []