Nessuna descrizione

matching.py 5.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. from __future__ import annotations
  2. from collections import Counter
  3. from dataclasses import dataclass
  4. from typing import Iterable, List, Sequence, Tuple
  5. from django.contrib.auth.models import User
  6. from django.db.models import Q
  7. from django.utils import timezone
  8. from api.models import IntroductionRequest, Opportunity, Profile
  9. def _now():
  10. return timezone.now()
  11. def tokenize_interests_text(text: str | None) -> set[str]:
  12. if not text:
  13. return set()
  14. raw = [t.strip().lower() for part in text.split(",") for t in part.split()] # type: ignore
  15. return {t for t in raw if t}
  16. def profile_interests(profile: Profile) -> set[str]:
  17. # Prefer taggit tags when available, fall back to comma-separated string
  18. try:
  19. tag_names = set(t.lower() for t in profile.tags.names())
  20. except Exception:
  21. tag_names = set()
  22. if tag_names:
  23. return tag_names
  24. return tokenize_interests_text(profile.interests)
  25. def tokenize_text(text: str | None) -> List[str]:
  26. if not text:
  27. return []
  28. clean = "".join(ch.lower() if ch.isalnum() or ch.isspace() else " " for ch in text)
  29. return [t for t in clean.split() if t]
  30. def jaccard(a: Iterable[str], b: Iterable[str]) -> float:
  31. sa, sb = set(a), set(b)
  32. if not sa and not sb:
  33. return 0.0
  34. inter = len(sa & sb)
  35. union = len(sa | sb)
  36. return inter / union if union else 0.0
  37. def cosine(a: Sequence[str], b: Sequence[str]) -> float:
  38. if not a or not b:
  39. return 0.0
  40. ca, cb = Counter(a), Counter(b)
  41. # dot product
  42. dot = sum(ca[t] * cb.get(t, 0) for t in ca)
  43. if dot == 0:
  44. return 0.0
  45. import math
  46. na = math.sqrt(sum(v * v for v in ca.values()))
  47. nb = math.sqrt(sum(v * v for v in cb.values()))
  48. denom = na * nb
  49. return (dot / denom) if denom else 0.0
  50. def time_decay(ts, half_life_days: float = 7.0) -> float:
  51. if not ts:
  52. return 0.0
  53. import math
  54. dt = (_now() - ts).total_seconds()
  55. half_life = half_life_days * 24 * 3600.0
  56. if dt <= 0:
  57. return 1.0
  58. return math.pow(0.5, dt / half_life)
  59. def neighbor_set(u: User) -> set[int]:
  60. # Users this user has interacted with via introduction requests
  61. ids = set(
  62. IntroductionRequest.objects.filter(Q(from_user=u) | Q(to_user=u))
  63. .values_list("from_user_id", "to_user_id")
  64. )
  65. # Flatten and drop self id
  66. flat = {i for pair in ids for i in pair if i and i != u.id}
  67. return flat
  68. def graph_similarity(u: User, v: User) -> float:
  69. nu, nv = neighbor_set(u), neighbor_set(v)
  70. return jaccard(nu, nv)
  71. def has_existing_request(u: User, v: User) -> bool:
  72. return IntroductionRequest.objects.filter(
  73. (Q(from_user=u, to_user=v) | Q(from_user=v, to_user=u))
  74. ).exists()
  75. @dataclass
  76. class RankedProfile:
  77. profile: Profile
  78. score: float
  79. def rank_users_for(user: User, k: int = 20) -> List[RankedProfile]:
  80. try:
  81. me_profile = Profile.objects.select_related("user").get(user=user)
  82. except Profile.DoesNotExist:
  83. me_profile = Profile(user=user) # empty defaults
  84. my_interests = profile_interests(me_profile)
  85. ranked: List[Tuple[float, Profile]] = []
  86. qs = (
  87. Profile.objects.select_related("user")
  88. .exclude(user=user)
  89. )
  90. for p in qs.iterator():
  91. if has_existing_request(user, p.user):
  92. continue
  93. jac = jaccard(my_interests, profile_interests(p))
  94. # Industry match: exact string match and non-empty
  95. industry_match = 1.0 if (me_profile.industry and me_profile.industry == p.industry) else 0.0
  96. ver = 1.0 if p.is_verified else 0.0
  97. gsim = graph_similarity(user, p.user)
  98. # Candidate activity recency: last accepted intro involving candidate
  99. last_acc = (
  100. IntroductionRequest.objects.filter(
  101. Q(from_user=p.user) | Q(to_user=p.user), is_accepted=True
  102. )
  103. .order_by("-created_at")
  104. .values_list("created_at", flat=True)
  105. .first()
  106. )
  107. rec = time_decay(last_acc) if last_acc else 0.0
  108. score = 0.45 * jac + 0.15 * industry_match + 0.20 * gsim + 0.10 * ver + 0.10 * rec
  109. if score > 0:
  110. ranked.append((score, p))
  111. ranked.sort(key=lambda t: t[0], reverse=True)
  112. top = ranked[:k]
  113. return [RankedProfile(profile=p, score=s) for s, p in top]
  114. @dataclass
  115. class RankedOpportunity:
  116. opportunity: Opportunity
  117. score: float
  118. def rank_opportunities_for(user: User, k: int = 20) -> List[RankedOpportunity]:
  119. try:
  120. me_profile = Profile.objects.select_related("user").get(user=user)
  121. except Profile.DoesNotExist:
  122. me_profile = Profile(user=user) # empty defaults
  123. query_tokens = tokenize_text((me_profile.bio or "") + " " + (me_profile.interests or "") + " " + (me_profile.industry or ""))
  124. ranked: List[Tuple[float, Opportunity]] = []
  125. for o in Opportunity.objects.all().iterator():
  126. doc_tokens = tokenize_text(o.title + " " + (o.description or ""))
  127. sim = cosine(query_tokens, doc_tokens)
  128. freshness = time_decay(o.created_at, half_life_days=10.0)
  129. score = 0.6 * sim + 0.4 * freshness
  130. if score > 0:
  131. ranked.append((score, o))
  132. ranked.sort(key=lambda t: t[0], reverse=True)
  133. top = ranked[:k]
  134. return [RankedOpportunity(opportunity=o, score=s) for s, o in top]