Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
ListFilterType,
ScopeType,
Seed,
Profile,
)
from .utils import (
dt_now,
Expand Down Expand Up @@ -199,6 +200,14 @@ async def init_index(self):
[("oid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
)

await self.crawl_configs.create_index(
[
("oid", pymongo.ASCENDING),
("inactive", pymongo.ASCENDING),
("profileid", pymongo.ASCENDING),
]
)

await self.crawl_configs.create_index(
[("lastRun", pymongo.DESCENDING), ("modified", pymongo.DESCENDING)]
)
Expand Down Expand Up @@ -834,10 +843,37 @@ async def get_crawl_configs(
async def is_profile_in_use(self, profileid: UUID, org: Organization) -> bool:
    """Return True if any active (non-inactive) workflow in the org uses the profile.

    The query key order (oid, inactive, profileid) matches the compound
    index created in init_index so the lookup is index-covered.
    """
    # diff artifact removed: the paste contained both the old and the new
    # query dict on consecutive lines; only the merged query is kept
    res = await self.crawl_configs.find_one(
        {"oid": org.id, "inactive": {"$ne": True}, "profileid": profileid}
    )
    return res is not None

async def mark_profiles_in_use(
    self, profiles: List[Profile], org: Organization
) -> List[Profile]:
    """Set ``inUse`` on each profile referenced by at least one active workflow.

    Runs a single aggregation over crawl configs (matching active configs in
    the org that reference any of the given profiles, grouped by profileid)
    instead of one query per profile, then flags the matching Profile objects
    in place and returns the same list.
    """
    profile_ids = [profile.id for profile in profiles]
    cursor = self.crawl_configs.aggregate(
        [
            {
                "$match": {
                    "oid": org.id,
                    "inactive": {"$ne": True},
                    "profileid": {"$in": profile_ids},
                }
            },
            {"$group": {"_id": "$profileid", "count": {"$sum": 1}}},
        ]
    )
    results = await cursor.to_list()

    # every group produced by {"$sum": 1} has count >= 1, so any returned
    # _id is a profile that is in use -- no count check needed
    in_use = {res["_id"] for res in results}

    for profile in profiles:
        if profile.id in in_use:
            profile.inUse = True

    return profiles

async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
"""Return the id of currently running crawl for this config, if any"""
# crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
Expand Down
21 changes: 10 additions & 11 deletions backend/btrixcloud/crawlmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .utils import dt_now, date_to_str, scale_from_browser_windows
from .k8sapi import K8sAPI

from .models import StorageRef, CrawlConfig, BgJobType
from .models import StorageRef, CrawlConfig, BgJobType, ProfileBrowserMetadata


# ============================================================================
Expand All @@ -25,13 +25,14 @@
class CrawlManager(K8sAPI):
"""abstract crawl manager"""

# pylint: disable=too-many-arguments
# pylint: disable=too-many-arguments, too-many-locals
async def run_profile_browser(
self,
userid: str,
oid: str,
url: str,
storage: StorageRef,
crawler_channel: str,
crawler_image: str,
image_pull_policy: str,
baseprofile: str = "",
Expand Down Expand Up @@ -59,6 +60,7 @@ async def run_profile_browser(
"url": url,
"vnc_password": secrets.token_hex(16),
"expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
"crawler_channel": crawler_channel,
"crawler_image": crawler_image,
"image_pull_policy": image_pull_policy,
"proxy_id": proxy_id or DEFAULT_PROXY_ID,
Expand Down Expand Up @@ -420,20 +422,17 @@ async def add_org_storage(
name=storage_secret, namespace=self.namespace, body=crawl_secret
)

async def get_profile_browser_metadata(
    self, browserid: str
) -> ProfileBrowserMetadata:
    """Get browser profile metadata parsed from the ProfileJob's labels.

    Propagates any error from get_profile_browser (e.g. browser not found)
    and any validation error from ProfileBrowserMetadata -- callers are
    expected to handle these (the old version swallowed errors and
    returned {}).
    """
    # diff artifact removed: the paste contained both the old dict-returning
    # version and this merged version; only the merged version is kept
    browser = await self.get_profile_browser(browserid)

    metadata = browser["metadata"]["labels"]

    # commit state lives in the ProfileJob spec, not in the labels
    metadata["committing"] = browser.get("spec", {}).get("committing")

    return ProfileBrowserMetadata(**metadata)

async def keep_alive_profile_browser(self, browserid: str, committing="") -> None:
"""update profile browser to not expire"""
Expand Down
28 changes: 22 additions & 6 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2455,6 +2455,25 @@ class Profile(BaseMongoModel):
inUse: bool = False


# ============================================================================
class ProfileBrowserMetadata(BaseModel):
    """Profile metadata stored in ProfileJob labels"""

    # id/name of the running profile browser
    browser: str

    # org id, read from the "btrix.org" k8s label
    oid: str = Field(alias="btrix.org")
    # id of the user who launched the browser ("btrix.user" label)
    userid: UUID = Field(alias="btrix.user")
    # base profile this browser was launched from, if any ("btrix.baseprofile" label)
    baseprofile: Optional[UUID] = Field(alias="btrix.baseprofile", default=None)
    # storage reference name ("btrix.storage" label) -- presumably a StorageRef
    # name; verify against run_profile_browser
    storage: str = Field(alias="btrix.storage")

    # id of the profile being created/updated by this browser
    profileid: UUID

    proxyid: str = ""
    crawlerChannel: str

    # commit state copied from the ProfileJob spec; checked for "done" by callers
    committing: Optional[str] = None


# ============================================================================
class UrlIn(BaseModel):
"""Request to set url"""
Expand Down Expand Up @@ -2485,17 +2504,14 @@ class ProfileCreate(BaseModel):
browserid: str
name: str
description: Optional[str] = ""
crawlerChannel: str = "default"
proxyId: Optional[str] = None


# ============================================================================
class ProfileUpdate(BaseModel):
class ProfileUpdate(ProfileCreate):
"""Update existing profile with new browser profile or metadata only"""

browserid: Optional[str] = ""
name: str
description: Optional[str] = ""
# browserid optional if only updating metadata
browserid: str = ""


# ============================================================================
Expand Down
91 changes: 63 additions & 28 deletions backend/btrixcloud/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
SuccessResponseStorageQuota,
ProfilePingResponse,
ProfileBrowserGetUrlResponse,
ProfileBrowserMetadata,
)
from .utils import dt_now, str_to_date

Expand Down Expand Up @@ -130,6 +131,7 @@ async def create_new_browser(
str(org.id),
url=str(profile_launch.url),
storage=org.storage,
crawler_channel=profile_launch.crawlerChannel,
crawler_image=crawler_image,
image_pull_policy=image_pull_policy,
baseprofile=prev_profile_id,
Expand Down Expand Up @@ -172,11 +174,20 @@ async def get_profile_browser_url(
params["url"] = url
return params

async def ping_profile_browser(
    self, metadata: ProfileBrowserMetadata, org: Organization
) -> dict[str, Any]:
    """Ping the profile browser to keep it running; return known origins.

    Origins reported by the running browser are merged with the origins of
    the base profile (if this browser was launched from one), de-duplicated,
    so the client sees the full set.
    """
    # diff artifact removed: the paste contained both the old browserid-based
    # signature/return and this merged version; only the merged version is kept
    data = await self._send_browser_req(metadata.browser, "/ping")
    origins = data.get("origins") or []

    if metadata.baseprofile:
        base = await self.get_profile(metadata.baseprofile, org)
        for origin in base.origins:
            if origin not in origins:
                origins.append(origin)

    return {"success": True, "origins": origins}

async def navigate_profile_browser(
self, browserid: str, urlin: UrlIn
Expand All @@ -190,21 +201,19 @@ async def navigate_profile_browser(

async def commit_to_profile(
self,
metadata: dict,
metadata: ProfileBrowserMetadata,
browser_commit: ProfileCreate,
org: Organization,
user: User,
existing_profile: Optional[Profile] = None,
) -> dict[str, Any]:
"""commit to profile async, returning if committed, or waiting"""
profileid = metadata.get("profileid")
if not profileid:
if not metadata.profileid:
raise HTTPException(status_code=400, detail="browser_not_valid")

self.orgs.can_write_data(org, include_time=False)

committing = metadata.get("committing")
if not committing:
if not metadata.committing:
self._run_task(
self.do_commit_to_profile(
metadata=metadata,
Expand All @@ -215,19 +224,19 @@ async def commit_to_profile(
)
)

if committing == "done":
if metadata.committing == "done":
await self.crawl_manager.delete_profile_browser(browser_commit.browserid)
return {
"added": True,
"id": profileid,
"id": str(metadata.profileid),
"storageQuotaReached": self.orgs.storage_quota_reached(org),
}

raise HTTPException(status_code=200, detail="waiting_for_browser")

async def do_commit_to_profile(
self,
metadata: dict,
metadata: ProfileBrowserMetadata,
browser_commit: ProfileCreate,
org: Organization,
user: User,
Expand All @@ -238,6 +247,8 @@ async def do_commit_to_profile(
try:
now = dt_now()

origins = []

if existing_profile:
profileid = existing_profile.id
created = existing_profile.created
Expand All @@ -246,8 +257,11 @@ async def do_commit_to_profile(
prev_file_size = (
existing_profile.resource.size if existing_profile.resource else 0
)

origins = existing_profile.origins

else:
profileid = UUID(metadata["profileid"])
profileid = metadata.profileid
created = now
created_by = user.id
created_by_name = user.name if user.name else user.email
Expand Down Expand Up @@ -275,10 +289,15 @@ async def do_commit_to_profile(
storage=org.storage,
)

baseid = metadata.get("btrix.baseprofile")
if baseid:
print("baseid", baseid)
baseid = UUID(baseid)
baseid = metadata.baseprofile

if origins:
for origin in data["origins"]:
if origin not in origins:
origins.append(origin)

else:
origins = data["origins"]

profile = Profile(
id=profileid,
Expand All @@ -290,13 +309,13 @@ async def do_commit_to_profile(
modified=now,
modifiedBy=user.id,
modifiedByName=user.name if user.name else user.email,
origins=data["origins"],
origins=origins,
resource=profile_file,
userid=UUID(metadata.get("btrix.user")),
userid=metadata.userid,
oid=org.id,
baseid=baseid,
crawlerChannel=browser_commit.crawlerChannel,
proxyId=browser_commit.proxyId,
crawlerChannel=metadata.crawlerChannel,
proxyId=metadata.proxyid,
)

await self.profiles.find_one_and_update(
Expand Down Expand Up @@ -455,6 +474,9 @@ async def list_profiles(
total = 0

profiles = [Profile.from_dict(res) for res in items]

profiles = await self.crawlconfigs.mark_profiles_in_use(profiles, org)

return profiles, total

async def get_profile(self, profileid: UUID, org: Organization) -> Profile:
Expand Down Expand Up @@ -611,15 +633,27 @@ def init_profiles_api(
org_crawl_dep = org_ops.org_crawl_dep

async def browser_get_metadata(
    browserid: str, org: Organization
) -> ProfileBrowserMetadata:
    """Load and validate metadata for a profile browser owned by this org.

    Raises 400 if the browser doesn't exist or its labels don't validate;
    404 if it exists but belongs to a different org.
    """
    # diff artifact removed: the paste contained both the old dict-based
    # version and this merged version; only the merged version is kept
    # if await ops.redis.hget(f"br:{browserid}", "org") != str(org.id):
    try:
        metadata = await crawl_manager.get_profile_browser_metadata(browserid)
    # pylint: disable=raise-missing-from
    except Exception as e:
        print(e)
        raise HTTPException(status_code=400, detail="invalid_profile_browser")

    # never expose a browser from another org
    if metadata.oid != str(org.id):
        raise HTTPException(status_code=404, detail="no_such_browser")

    return metadata

async def browser_metadata_dep(
    browserid: str, org: Organization = Depends(org_crawl_dep)
):
    """FastAPI dependency: resolve browserid to validated browser metadata."""
    return await browser_get_metadata(browserid, org)

async def browser_dep(browserid: str, org: Organization = Depends(org_crawl_dep)):
    """FastAPI dependency: validate browser existence/ownership, return the raw id."""
    await browser_get_metadata(browserid, org)
    return browserid
Expand Down Expand Up @@ -673,8 +707,6 @@ async def commit_browser_to_existing(
browserid=browser_commit.browserid,
name=browser_commit.name,
description=browser_commit.description or profile.description,
crawlerChannel=profile.crawlerChannel,
proxyId=profile.proxyId,
),
org=org,
user=user,
Expand Down Expand Up @@ -707,8 +739,11 @@ async def create_new(
return await ops.create_new_browser(org, user, profile_launch)

@router.post("/browser/{browserid}/ping", response_model=ProfilePingResponse)
async def ping_profile_browser(
    metadata: ProfileBrowserMetadata = Depends(browser_metadata_dep),
    org: Organization = Depends(org_crawl_dep),
):
    """Keep the profile browser alive; returns success + known origins."""
    # diff artifact removed: the paste contained both the old browserid-based
    # handler and this merged version; only the merged version is kept
    return await ops.ping_profile_browser(metadata, org)

@router.post("/browser/{browserid}/navigate", response_model=SuccessResponse)
async def navigate_profile_browser(
Expand Down
Loading
Loading