Coverage for apps / recipes / services / search_parsers.py: 72%

134 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 13:22 +0000

1""" 

2HTML/URL parsing helpers for recipe search. 

3 

4Extracted from RecipeSearch to keep search.py focused on orchestration. 

5All functions are module-level (no class needed). 

6""" 

7 

8import logging 

9import re 

10from typing import Optional 

11from urllib.parse import urljoin 

12 

13from bs4 import BeautifulSoup 

14 

15from apps.recipes.services.search import SearchResult 

16from apps.recipes.services.search_classifiers import ( 

17 get_url_signal, 

18 looks_like_recipe_title, 

19) 

20 

21logger = logging.getLogger(__name__) 

22 

23 

def find_link(element) -> Optional[tuple]:
    """Locate the anchor that carries a result's URL.

    The first descendant ``<a>`` with an ``href`` attribute is preferred; if
    there is none, ``element`` itself is used when it is an ``<a>`` with a
    non-empty ``href``.

    Returns:
        Tuple of (link_element, url) if a link with a non-empty href is
        found, None otherwise.
    """
    anchor = element.find("a", href=True)
    if not anchor:
        # No nested anchor: the element may itself be the link.
        if element.name == "a" and element.get("href"):
            anchor = element
        else:
            anchor = None
    if not anchor:
        return None

    href = anchor.get("href", "")
    return (anchor, href) if href else None

41 

42 

def extract_title(element, link) -> str:
    """Extract a result title, trying progressively weaker sources.

    Order of preference:
      1. The first ``<h2>``/``<h3>``/``<h4>`` inside ``element``.
      2. The first descendant whose class is ``title`` or contains "title".
      3. The visible text of ``link``.
      4. The link's ``title`` or ``aria-label`` attribute (returned as-is).

    Text from sources 1-3 is read with separator=" " so nested metadata
    spans (e.g. prep-time) don't bleed into the title without whitespace,
    and is then passed through ``_strip_title_metadata``.

    Args:
        element: The result container to search.
        link: The anchor element previously located for this result.

    Returns:
        The best title found, or "" when no source yields any text.
    """

    def _text(node) -> str:
        return node.get_text(separator=" ", strip=True) if node else ""

    # find() treats every list item as a literal tag name, so ".title" or
    # '[class*="title"]' can never match there; class-based lookups must go
    # through a CSS selector. Headings keep priority over class matches.
    title = (
        _text(element.find(["h2", "h3", "h4"]))
        or _text(element.select_one('.title, [class*="title"]'))
        or _text(link)
    )
    if title:
        return _strip_title_metadata(title)

    return link.get("title", "") or link.get("aria-label", "")

60 

61 

# Trailing duration suffix on a card title: whitespace, a number and an
# hour/minute unit, optionally followed by a second "<number> <minute unit>"
# part -- e.g. " 30 mins" or " 1 hr 30 mins". Anchored to the end of the
# string and matched case-insensitively.
_TRAILING_TIME_RE = re.compile(
    r"\s+\d+\s*(?:hr?s?|hour[s]?|min(?:ute)?[s]?)"
    r"(?:\s+\d+\s*(?:min(?:ute)?[s]?))?$",
    flags=re.IGNORECASE,
)

67 

68 

def _strip_title_metadata(title: str) -> str:
    """Return ``title`` without a trailing duration suffix such as "30 mins".

    Whatever ``_TRAILING_TIME_RE`` matches is removed, then surrounding
    whitespace is trimmed.
    """
    without_time = re.sub(_TRAILING_TIME_RE, "", title)
    return without_time.strip()

72 

73 

def extract_rating(title: str) -> tuple[str, Optional[int]]:
    """Split a trailing "<count> Ratings" suffix off a title.

    Handles card text where the rating count runs straight into the recipe
    name, e.g. "Recipe Name1,392Ratings".

    Returns:
        Tuple of (cleaned_title, rating_count). When there is no suffix, or
        the captured count is not a valid integer, the title is returned
        unchanged together with None.
    """
    found = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
    if found is None:
        return title, None

    digits = found.group(1).replace(",", "")
    try:
        count = int(digits)
    except ValueError:
        # The captured group held no digits (e.g. only commas).
        return title, None

    return title[: found.start()].strip(), count

93 

94 

def extract_description(element) -> str:
    """Extract a short description (at most 200 characters) from element.

    The first ``<p>`` is tried first; if it is missing or empty, the first
    descendant whose class is ``description`` or contains "description" is
    used instead.

    Returns:
        The description text truncated to 200 characters, or "" when
        neither source yields any text.
    """

    def _text(node) -> str:
        # separator=" " keeps text from adjacent inline tags from being
        # fused together when each fragment is stripped.
        return node.get_text(separator=" ", strip=True) if node else ""

    # find() treats list items as literal tag names, so class-based matching
    # has to go through a CSS selector instead.
    text = _text(element.find("p")) or _text(
        element.select_one('.description, [class*="description"]')
    )
    return text[:200]

101 

102 

# One srcset candidate: a URL (a run of non-space characters that does not
# start with a comma) followed by whitespace and a width descriptor such as
# "640w". The URL part is lazy so it stops at the first width descriptor.
_SRCSET_ENTRY_RE = re.compile(r"([^\s,]\S*?)\s+(\d+)w\b")


def parse_srcset(srcset: str) -> list[tuple[str, int]]:
    """Parse srcset string into (url, width) pairs.

    Handles srcset format: "url1 100w, url2 200w"
    URLs may contain commas (e.g. resize=93,84) so we split on
    the width descriptor pattern rather than plain commas.

    Absolute, protocol-relative ("//cdn/...") and relative ("/img/...")
    URLs are all accepted; resolving them against a base URL is left to the
    caller. Inline ``data:`` candidates are skipped, and candidates without
    a width descriptor (e.g. "2x") are ignored.

    Args:
        srcset: Raw value of a ``srcset`` attribute; empty or None yields [].

    Returns:
        List of (url, width) tuples in the order they appear.
    """
    entries = []
    for match in _SRCSET_ENTRY_RE.finditer(srcset or ""):
        url = match.group(1)
        if url.lower().startswith("data:"):
            # Inline placeholder, not a fetchable image URL.
            continue
        entries.append((url, int(match.group(2))))
    return entries

116 

117 

def _collect_srcset_strings(element) -> list[str]:
    """Gather candidate srcset strings from ``element``.

    Takes every ``<source>`` of the first ``<picture>`` whose ``srcset`` is
    non-empty and whose ``type`` does not mention WebP, followed by the
    ``srcset`` of the first ``<img>`` if it has one.
    """
    collected = []

    picture_tag = element.find("picture")
    sources = picture_tag.find_all("source") if picture_tag else []
    for source in sources:
        candidate = source.get("srcset", "")
        if not candidate:
            continue
        mime = (source.get("type") or "").lower()
        if "webp" in mime:
            continue
        collected.append(candidate)

    img_tag = element.find("img")
    if img_tag:
        img_srcset = img_tag.get("srcset", "")
        if img_srcset:
            collected.append(img_srcset)

    return collected

131 

132 

def best_url_from_srcset(element, base_url: str) -> str:
    """Return the widest srcset candidate as an absolute URL.

    Candidates come from ``_collect_srcset_strings`` and are parsed with
    ``parse_srcset``. On equal widths the earliest candidate wins. Returns
    "" when there are no candidates.
    """
    widest_url = ""
    widest = None
    for srcset in _collect_srcset_strings(element):
        for candidate_url, width in parse_srcset(srcset):
            # Strict comparison keeps the first candidate on ties.
            if widest is None or width > widest:
                widest_url, widest = candidate_url, width

    if widest is None:
        return ""
    return urljoin(base_url, widest_url)

140 

141 

def extract_image(element, base_url: str) -> str:
    """Resolve an image URL for ``element``, preferring the largest variant.

    The widest srcset candidate (including ``<picture><source srcset>``) is
    used when available. Otherwise the first ``<img>`` is inspected and the
    first non-empty attribute among ``src``, ``data-src`` and
    ``data-lazy-src`` is returned, resolved against ``base_url``.

    Returns:
        An absolute image URL, or "" when nothing usable is found.
    """
    # srcset usually offers something larger than the thumbnail in src.
    from_srcset = best_url_from_srcset(element, base_url)
    if from_srcset:
        return from_srcset

    img_tag = element.find("img")
    if not img_tag:
        return ""

    for attribute in ("src", "data-src", "data-lazy-src"):
        value = img_tag.get(attribute)
        if value:
            return urljoin(base_url, value)
    return ""

162 

163 

def extract_result_from_element(
    element,
    host: str,
    base_url: str,
) -> Optional[SearchResult]:
    """Build a SearchResult from one result element, or None if filtered out.

    The element is rejected (None is returned) when:
      * no link with a non-empty href is found,
      * the URL signal is "strong_exclude" or "reject",
      * no title can be extracted, or it is empty once the rating suffix
        has been stripped,
      * the title does not look like a recipe title,
      * the URL signal is "neutral" and the image or description is missing.

    Args:
        element: Candidate result container.
        host: Host name, passed to the URL classifier and stored on the result.
        base_url: Base used to make link and image URLs absolute.
    """
    located = find_link(element)
    if not located:
        return None
    anchor, href = located

    # Classify the absolute URL before doing any further extraction work.
    absolute_url = urljoin(base_url, href)
    signal = get_url_signal(absolute_url, host)
    if signal in ("strong_exclude", "reject"):
        return None

    raw_title = extract_title(element, anchor)
    if not raw_title:
        return None

    # Stripping the rating suffix can leave nothing behind (QA-053).
    clean_title, ratings = extract_rating(raw_title)
    if not clean_title:
        return None

    # Filter non-recipe content by title (012-filter-search-results).
    if not looks_like_recipe_title(clean_title, signal):
        logger.debug("Filtered non-recipe title: %s (%s)", clean_title, absolute_url)
        return None

    image = extract_image(element, base_url)
    summary = extract_description(element)

    # Neutral URLs must have both image AND description -- recipe cards almost
    # always do; editorial/article results often lack one or both.
    if signal == "neutral" and not (image and summary):
        logger.debug("Filtered neutral URL missing image or description: %s (%s)", clean_title, absolute_url)
        return None

    return SearchResult(
        url=absolute_url,
        title=clean_title[:200],
        host=host,
        image_url=image,
        description=summary,
        rating_count=ratings,
    )

216 

217 

def _parse_articles(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from <article> elements.

    Only the first 30 ``<article>`` elements are examined; elements that do
    not yield a result are skipped.
    """
    parsed = []
    for article in soup.find_all("article")[:30]:
        result = extract_result_from_element(article, host, base_url)
        if result:
            parsed.append(result)
    return parsed

226 

227 

def _parse_cards(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from card-like div elements.

    Selectors are tried from most to least specific; the first selector
    that produces at least one result wins. At most 30 matches are examined
    per selector.
    """
    selectors = (
        '[class*="recipe-card"]',
        '[class*="card"]',
        '[class*="result"]',
        '[class*="item"]',
    )
    for selector in selectors:
        matched = []
        for node in soup.select(selector)[:30]:
            result = extract_result_from_element(node, host, base_url)
            if result:
                matched.append(result)
        if matched:
            return matched
    return []

240 

241 

def _parse_links(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from links that look like recipe URLs.

    Examines the first 100 anchors that have an ``href``. An anchor is kept
    when its absolute URL classifies as "strong_include" or "neutral", its
    text is longer than 5 characters and the text looks like a recipe title.
    Results carry only url, title (truncated to 200 characters) and host.
    """
    results = []
    for link in soup.find_all("a", href=True)[:100]:
        url = urljoin(base_url, link.get("href", ""))
        url_signal = get_url_signal(url, host)
        if url_signal not in ("strong_include", "neutral"):
            continue
        # separator=" " keeps text from nested spans from being fused
        # together, matching how extract_title reads link text.
        title = link.get_text(separator=" ", strip=True)
        if len(title) > 5 and looks_like_recipe_title(title, url_signal):
            results.append(SearchResult(url=url, title=title[:200], host=host))
    return results

258 

259 

def fallback_parse(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Fallback parser for sites without a specific selector.

    Runs the article, card and bare-link strategies in that order and
    returns the output of the first one that finds anything; later
    strategies are not run once one succeeds. Returns [] if all come up
    empty.
    """
    strategies = (_parse_articles, _parse_cards, _parse_links)
    # The generator is lazy, so each strategy runs only if the ones before
    # it returned nothing.
    attempts = (parse(soup, host, base_url) for parse in strategies)
    return next((found for found in attempts if found), [])

← Back to Dashboard