diff --git a/independent-publisher-connectors/YouTube Transcripts/ConnectorPackage.zip b/independent-publisher-connectors/YouTube Transcripts/ConnectorPackage.zip new file mode 100644 index 0000000000..b2eee8d0a5 Binary files /dev/null and b/independent-publisher-connectors/YouTube Transcripts/ConnectorPackage.zip differ diff --git a/independent-publisher-connectors/YouTube Transcripts/apiDefinition.swagger.json b/independent-publisher-connectors/YouTube Transcripts/apiDefinition.swagger.json new file mode 100644 index 0000000000..dbada3c3bc --- /dev/null +++ b/independent-publisher-connectors/YouTube Transcripts/apiDefinition.swagger.json @@ -0,0 +1,246 @@ +{ + "swagger": "2.0", + "info": { + "title": "YouTube Transcript", + "description": "A custom service to retrieve transcripts from YouTube videos using the internal YouTube API.", + "version": "1.0.0", + "contact": { + "name": "Troy Taylor", + "url": "https://github.com/troystaylor/PowerPlatformConnectors", + "email": "troy@troystaylor.com" + } + }, + "host": "www.youtube.com", + "basePath": "/", + "schemes": [ + "https" + ], + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "securityDefinitions": {}, + "security": [], + "paths": { + "/youtubei/v1/get_transcript": { + "post": { + "operationId": "GetTranscript", + "summary": "Get Video Transcript", + "description": "Retrieves and transforms the transcript for a specified YouTube video into a clean, Power Platform-friendly format. Simply provide the YouTube video ID and the custom code handles all complex parameter generation automatically.", + "x-ms-summary": "Get Video Transcript", + "x-ms-visibility": "important", + "parameters": [ + { + "name": "body", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/TranscriptRequest" + }, + "x-ms-summary": "Request Body", + "description": "The request body containing video ID and parameters." 
+ } + ], + "responses": { + "200": { + "description": "Response from YouTube transcript API - may contain success or error information.", + "x-ms-summary": "API Response", + "schema": { + "$ref": "#/definitions/TranscriptResponse" + } + } + } + } + } + }, + "definitions": { + "TranscriptRequest": { + "type": "object", + "required": [ + "externalVideoId" + ], + "properties": { + "context": { + "$ref": "#/definitions/RequestContext" + }, + "externalVideoId": { + "type": "string", + "description": "The YouTube video ID (11 characters) - found at the end of YouTube video URLs (e.g., youtube.com/watch?v=DC2p3kFjcK0)", + "x-ms-summary": "YouTube Video ID", + "x-ms-visibility": "important", + "pattern": "^[a-zA-Z0-9_-]{11}$", + "x-ms-test-value": "DC2p3kFjcK0" + } + } + }, + "RequestContext": { + "type": "object", + "description": "Context information for the API request.", + "x-ms-summary": "Request Context", + "required": [ + "client" + ], + "properties": { + "client": { + "$ref": "#/definitions/ClientInfo" + } + } + }, + "ClientInfo": { + "type": "object", + "description": "Client details for the API request.", + "x-ms-summary": "Client Information", + "required": [ + "clientName", + "clientVersion" + ], + "properties": { + "clientName": { + "type": "string", + "description": "The client name identifier (static value - does not impact request).", + "x-ms-summary": "Client Name", + "default": "WEB", + "x-ms-visibility": "internal" + }, + "clientVersion": { + "type": "string", + "description": "The client version string (static value - does not impact request).", + "x-ms-summary": "Client Version", + "default": "2.20250923.08.00", + "x-ms-visibility": "internal" + } + } + }, + "TranscriptResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "x-ms-summary": "Success", + "description": "Indicates if the transcript was successfully retrieved." 
+ }, + "segments": { + "type": "array", + "x-ms-summary": "Transcript Segments", + "description": "Array of transcript segments with text and timing information.", + "items": { + "$ref": "#/definitions/TranscriptSegment" + } + }, + "totalSegments": { + "type": "integer", + "format": "int32", + "x-ms-summary": "Total Segments", + "description": "Total number of transcript segments." + }, + "totalDurationMs": { + "type": "integer", + "format": "int32", + "x-ms-summary": "Total Duration (ms)", + "description": "Total video duration in milliseconds." + }, + "totalDurationFormatted": { + "type": "string", + "x-ms-summary": "Total Duration", + "description": "Total video duration in human-readable format (e.g., '4:36')" + }, + "fullTranscript": { + "type": "string", + "x-ms-summary": "Full Transcript", + "description": "Complete transcript text as a single string." + }, + "language": { + "type": "string", + "x-ms-summary": "Language", + "description": "Language of the transcript (e.g., 'English (auto-generated)')" + }, + "processedAt": { + "type": "string", + "x-ms-summary": "Processed At", + "description": "ISO 8601 timestamp when the response was processed." + }, + "error": { + "type": "string", + "x-ms-summary": "Error Message", + "description": "Error message if success is false." + } + } + }, + "TranscriptSegment": { + "type": "object", + "properties": { + "text": { + "type": "string", + "x-ms-summary": "Text", + "description": "The transcript text for this segment (cleaned)." + }, + "startMs": { + "type": "integer", + "format": "int32", + "x-ms-summary": "Start Time (ms)", + "description": "Start time in milliseconds." + }, + "endMs": { + "type": "integer", + "format": "int32", + "x-ms-summary": "End Time (ms)", + "description": "End time in milliseconds." + }, + "durationMs": { + "type": "integer", + "format": "int32", + "x-ms-summary": "Duration (ms)", + "description": "Duration of this segment in milliseconds." 
+ }, + "startTime": { + "type": "string", + "x-ms-summary": "Start Time (Original)", + "description": "Original YouTube time format (e.g., '3:24')" + }, + "startTimeFormatted": { + "type": "string", + "x-ms-summary": "Start Time (Formatted)", + "description": "Formatted start time (e.g., '3:24')" + }, + "endTimeFormatted": { + "type": "string", + "x-ms-summary": "End Time (Formatted)", + "description": "Formatted end time (e.g., '3:26')" + }, + "durationFormatted": { + "type": "string", + "x-ms-summary": "Duration (Formatted)", + "description": "Formatted duration (e.g., '0:02')" + }, + "wordCount": { + "type": "integer", + "format": "int32", + "x-ms-summary": "Word Count", + "description": "Number of words in this segment." + }, + "characterCount": { + "type": "integer", + "format": "int32", + "x-ms-summary": "Character Count", + "description": "Number of characters in this segment." + } + } + } + }, + "x-ms-connector-metadata": [ + { + "propertyName": "Website", + "propertyValue": "https://www.youtube.com" + }, + { + "propertyName": "Privacy policy", + "propertyValue": "https://policies.google.com/privacy" + }, + { + "propertyName": "Categories", + "propertyValue": "Content and Files;Productivity" + } + ] +} \ No newline at end of file diff --git a/independent-publisher-connectors/YouTube Transcripts/apiProperties.json b/independent-publisher-connectors/YouTube Transcripts/apiProperties.json new file mode 100644 index 0000000000..597391cacf --- /dev/null +++ b/independent-publisher-connectors/YouTube Transcripts/apiProperties.json @@ -0,0 +1,12 @@ +{ + "properties": { + "iconBrandColor": "#da3b01", + "capabilities": [ + "actions" + ], + "publisher": "Troy Taylor", + "stackOwner": "YouTube", + "connectionParameters": {}, + "policyTemplateInstances": [] + } +} \ No newline at end of file diff --git a/independent-publisher-connectors/YouTube Transcripts/readme.md b/independent-publisher-connectors/YouTube Transcripts/readme.md new file mode 100644 index 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

/// <summary>
/// Custom code for the "YouTube Transcript" connector. It rewrites the incoming
/// GetTranscript request (generates the protobuf <c>params</c> value, fills in the
/// client context, adds browser-like headers), forwards it to YouTube, and flattens
/// the response into the simplified schema declared in the connector's Swagger file.
/// </summary>
/// <remarks>
/// The endpoint is YouTube's internal API; its response layout may change without
/// notice, which is why extraction tries several layouts before giving up.
/// </remarks>
public class Script : ScriptBase
{
    // Cached compiled regexes (shared across invocations).
    private static readonly Regex WhitespaceRegex = new Regex(@"\s+", RegexOptions.Compiled);
    private static readonly Regex SoundEffectsRegex = new Regex(@"\[.*?\]", RegexOptions.Compiled);
    private static readonly Regex BackgroundNoiseRegex = new Regex(@"\(.*?\)", RegexOptions.Compiled);

    // Same pattern the Swagger definition declares for externalVideoId; \A...\z so a
    // trailing newline cannot slip through the way it can with ^...$.
    private static readonly Regex VideoIdRegex = new Regex(@"\A[a-zA-Z0-9_-]{11}\z", RegexOptions.Compiled);

    // Chrome major version is kept in step with the Sec-Ch-Ua header sent below.
    private const string DefaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36";
    private const string DefaultLanguage = "English (auto-generated)";
    private const string DefaultClientName = "WEB";
    private const string DefaultClientVersion = "2.20250923.08.00";
    private const string TranscriptRendererPath = "actions[0].updateEngagementPanelAction.content.transcriptRenderer";
    private const int MaxSearchDepth = 5;
    private const int NestedSearchDepth = 3;
    private const int MaxDeepSearchSegments = 1000;

    /// <summary>
    /// Connector entry point: forwards the rewritten request and returns the
    /// transformed transcript, or a JSON error payload on failure.
    /// </summary>
    /// <returns>
    /// The upstream response with its body replaced by the simplified transcript JSON;
    /// a 400 payload for invalid input; the upstream status for upstream failures;
    /// a 500 payload for unexpected errors.
    /// </returns>
    public override async Task<HttpResponseMessage> ExecuteAsync()
    {
        this.Context.Logger.LogInformation("YouTube Transcript: Starting processing");
        try
        {
            var modifiedRequest = await ProcessRequest(this.Context.Request).ConfigureAwait(false);
            var response = await this.Context.SendAsync(modifiedRequest, this.CancellationToken).ConfigureAwait(false);

            this.Context.Logger.LogInformation($"YouTube API response: {response.StatusCode}");

            var responseString = await ReadContentAsync(response).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                this.Context.Logger.LogWarning($"API error: {response.StatusCode} - {responseString}");
                return CreateErrorResponse(response.StatusCode, "YouTube API request failed");
            }

            response.Content = CreateJsonContent(TransformYouTubeResponse(responseString));

            this.Context.Logger.LogInformation("Response transformation completed successfully");
            return response;
        }
        catch (ArgumentException ex)
        {
            // Raised by request validation: report as a client error, not a 500.
            this.Context.Logger.LogWarning($"Invalid request: {ex.Message}");
            return CreateErrorResponse(HttpStatusCode.BadRequest, ex.Message);
        }
        catch (Exception ex)
        {
            this.Context.Logger.LogError($"Processing error: {ex.Message}");
            return CreateErrorResponse(HttpStatusCode.InternalServerError, $"Processing error: {ex.Message}");
        }
    }

    /// <summary>Reads a response body as text, treating a missing body as empty.</summary>
    private static async Task<string> ReadContentAsync(HttpResponseMessage response)
    {
        if (response.Content == null)
        {
            return string.Empty;
        }

        return await response.Content.ReadAsStringAsync().ConfigureAwait(false);
    }

    /// <summary>
    /// Builds the request actually sent to YouTube from the caller's request body.
    /// </summary>
    /// <param name="originalRequest">The request received by the connector.</param>
    /// <returns>A new request with params, context and headers set; the original request when it has no body.</returns>
    /// <exception cref="ArgumentException">The body is not a JSON object or the video ID is missing/invalid.</exception>
    private async Task<HttpRequestMessage> ProcessRequest(HttpRequestMessage originalRequest)
    {
        if (originalRequest.Content == null)
        {
            return originalRequest;
        }

        var contentString = await originalRequest.Content.ReadAsStringAsync().ConfigureAwait(false);

        JObject requestJson;
        try
        {
            requestJson = JObject.Parse(contentString);
        }
        catch (JsonException ex)
        {
            throw new ArgumentException("Request body must be a JSON object.", ex);
        }

        var videoId = requestJson["externalVideoId"]?.ToString().Trim();
        if (string.IsNullOrEmpty(videoId) || !VideoIdRegex.IsMatch(videoId))
        {
            throw new ArgumentException("externalVideoId must be an 11-character YouTube video ID.");
        }

        requestJson["externalVideoId"] = videoId;

        // Generate params unless the caller supplied a usable value.
        if (ShouldGenerateParams(requestJson["params"]?.ToString()))
        {
            requestJson["params"] = GenerateYouTubeParams(videoId);
        }

        EnsureContext(requestJson);

        return CreateRequestWithHeaders(originalRequest, requestJson.ToString(), videoId);
    }

    /// <summary>True when no params were supplied, the placeholder was used, or the value is not base64.</summary>
    private bool ShouldGenerateParams(string existingParams)
    {
        return string.IsNullOrWhiteSpace(existingParams) ||
               existingParams == "AUTO_GENERATE" ||
               !IsValidBase64(existingParams);
    }

    /// <summary>Adds the default client context when the caller did not provide one.</summary>
    private void EnsureContext(JObject requestJson)
    {
        var context = requestJson["context"];
        if (context != null && context.Type != JTokenType.Null)
        {
            return;
        }

        requestJson["context"] = new JObject
        {
            ["client"] = new JObject
            {
                ["clientName"] = DefaultClientName,
                ["clientVersion"] = DefaultClientVersion
            }
        };
    }

    /// <summary>
    /// Creates the outgoing request: same method and URI, browser-like headers,
    /// any non-conflicting original headers, and the rewritten JSON body.
    /// </summary>
    private HttpRequestMessage CreateRequestWithHeaders(HttpRequestMessage originalRequest, string content, string videoId)
    {
        var newRequest = new HttpRequestMessage(originalRequest.Method, originalRequest.RequestUri);

        // Header names are case-insensitive, so the conflict check below must be too.
        var headers = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
        {
            ["User-Agent"] = DefaultUserAgent,
            ["Accept"] = "application/json",
            ["Accept-Language"] = "en-US,en;q=0.9",
            ["Origin"] = "https://www.youtube.com",
            ["Referer"] = $"https://www.youtube.com/watch?v={videoId}",
            ["Sec-Ch-Ua"] = "\"Not_A Brand\";v=\"99\", \"Google Chrome\";v=\"109\", \"Chromium\";v=\"109\"",
            ["Sec-Ch-Ua-Mobile"] = "?0",
            ["Sec-Ch-Ua-Platform"] = "\"Windows\"",
            ["Sec-Fetch-Dest"] = "empty",
            ["Sec-Fetch-Mode"] = "cors",
            ["Sec-Fetch-Site"] = "same-origin"
        };

        foreach (var header in headers)
        {
            newRequest.Headers.TryAddWithoutValidation(header.Key, header.Value);
        }

        // Copy non-conflicting original headers.
        foreach (var header in originalRequest.Headers)
        {
            if (!headers.ContainsKey(header.Key))
            {
                newRequest.Headers.TryAddWithoutValidation(header.Key, header.Value);
            }
        }

        newRequest.Content = new StringContent(content, Encoding.UTF8, "application/json");
        return newRequest;
    }

    /// <summary>
    /// Builds the <c>params</c> value for a video: a protobuf message holding the
    /// video ID (field 1) and a nested, base64 + percent-encoded message (field 2)
    /// with "asr", "en" and an empty third field.
    /// </summary>
    /// <param name="videoId">An 11-character YouTube video ID.</param>
    /// <returns>The URL-safe base64 encoding of the outer message.</returns>
    /// <exception cref="ArgumentException">The video ID is not a valid 11-character ID.</exception>
    private string GenerateYouTubeParams(string videoId)
    {
        // No hard-coded fallback: a canned value would request a different video's transcript.
        if (string.IsNullOrEmpty(videoId) || !VideoIdRegex.IsMatch(videoId))
        {
            throw new ArgumentException("externalVideoId must be an 11-character YouTube video ID.");
        }

        var inner = new List<byte>(16);
        AppendLengthDelimited(inner, 0x0A, Encoding.UTF8.GetBytes("asr"));
        AppendLengthDelimited(inner, 0x12, Encoding.UTF8.GetBytes("en"));
        AppendLengthDelimited(inner, 0x1A, new byte[0]);

        // Padded base64, then percent-encoded ("=" becomes "%3D").
        var innerEncoded = Uri.EscapeDataString(Convert.ToBase64String(inner.ToArray()));

        var outer = new List<byte>(48);
        AppendLengthDelimited(outer, 0x0A, Encoding.UTF8.GetBytes(videoId));
        AppendLengthDelimited(outer, 0x12, Encoding.UTF8.GetBytes(innerEncoded));

        return ToBase64Url(outer.ToArray());
    }

    /// <summary>Appends one length-delimited protobuf field (tag byte, one-byte length, payload).</summary>
    private static void AppendLengthDelimited(List<byte> buffer, byte tag, byte[] payload)
    {
        // Lengths above 127 need a multi-byte varint, which this writer does not emit.
        if (payload.Length > 127)
        {
            throw new ArgumentException("Protobuf field payload is too long for a single-byte length.");
        }

        buffer.Add(tag);
        buffer.Add((byte)payload.Length);
        buffer.AddRange(payload);
    }

    /// <summary>True when the value decodes as standard or URL-safe base64 (padding optional).</summary>
    private bool IsValidBase64(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return false;
        }

        var base64 = value.Replace('-', '+').Replace('_', '/');
        var remainder = base64.Length % 4;
        if (remainder != 0)
        {
            base64 = base64.PadRight(base64.Length + (4 - remainder), '=');
        }

        try
        {
            Convert.FromBase64String(base64);
            return true;
        }
        catch (FormatException)
        {
            return false;
        }
    }

    /// <summary>Encodes bytes as URL-safe base64 without padding.</summary>
    private string ToBase64Url(byte[] bytes) =>
        Convert.ToBase64String(bytes).Replace('+', '-').Replace('/', '_').TrimEnd('=');

    /// <summary>
    /// Converts the raw YouTube JSON into the simplified transcript payload.
    /// </summary>
    /// <param name="youtubeResponse">The raw response body.</param>
    /// <returns>JSON text; <c>success</c> is false when parsing fails or no segments are found.</returns>
    private string TransformYouTubeResponse(string youtubeResponse)
    {
        try
        {
            var youtubeData = JObject.Parse(youtubeResponse);
            var segments = ExtractTranscriptSegments(youtubeData);

            if (segments.Count == 0)
            {
                // An empty transcript is not a successful retrieval.
                return CreateFallbackResponse(youtubeResponse, "No transcript segments found in the YouTube response").ToString();
            }

            var totalDurationMs = segments.Max(s => s.EndMs);

            var result = new JObject
            {
                ["success"] = true,
                ["segments"] = JArray.FromObject(segments),
                ["totalSegments"] = segments.Count,
                ["totalDurationMs"] = totalDurationMs,
                ["totalDurationFormatted"] = FormatDuration(totalDurationMs),
                // Segments whose text was entirely sound-effect markers clean to "" and are skipped here.
                ["fullTranscript"] = string.Join(" ", segments.Select(s => s.Text).Where(t => !string.IsNullOrEmpty(t))),
                ["language"] = ExtractLanguageInfo(youtubeData),
                ["processedAt"] = UtcTimestamp()
            };

            return result.ToString();
        }
        catch (Exception ex)
        {
            this.Context.Logger.LogError($"Response transformation error: {ex.Message}");
            return CreateFallbackResponse(youtubeResponse, ex.Message).ToString();
        }
    }

    /// <summary>
    /// Extracts segments using the known layouts first and a bounded deep search
    /// second; the result is ordered by start time.
    /// </summary>
    private List<TranscriptSegment> ExtractTranscriptSegments(JObject youtubeData)
    {
        var segments = new List<TranscriptSegment>();

        if (TryExtractFromActions(youtubeData, segments))
        {
            this.Context.Logger.LogInformation($"Primary extraction successful: {segments.Count} segments found");
            return segments.OrderBy(s => s.StartMs).ToList();
        }

        // Drop anything a failed primary pass left behind so the deep search cannot duplicate it.
        segments.Clear();

        this.Context.Logger.LogInformation("Primary extraction failed, attempting deep search");
        SearchForTranscriptContentOptimized(youtubeData, segments, 0, MaxSearchDepth);
        this.Context.Logger.LogInformation($"Deep search completed: {segments.Count} segments found");

        return segments.OrderBy(s => s.StartMs).ToList();
    }

    /// <summary>Lists an object's property names for diagnostics without assuming the token is an object.</summary>
    private static string DescribeKeys(JToken token)
    {
        var obj = token as JObject;
        return obj == null ? "(not an object)" : string.Join(", ", obj.Properties().Select(p => p.Name));
    }

    /// <summary>
    /// Tries the <c>cueGroups</c> layout under the transcript renderer, then the
    /// alternative search-panel layout.
    /// </summary>
    /// <returns>True when at least one segment was added.</returns>
    private bool TryExtractFromActions(JObject youtubeData, List<TranscriptSegment> segments)
    {
        try
        {
            this.Context.Logger.LogInformation($"YouTube response top-level keys: {DescribeKeys(youtubeData)}");

            // SelectToken yields null for any missing step instead of throwing.
            var transcriptRenderer = youtubeData.SelectToken(TranscriptRendererPath);
            this.Context.Logger.LogInformation($"TranscriptRenderer found: {transcriptRenderer != null}");
            if (transcriptRenderer == null)
            {
                return false;
            }

            this.Context.Logger.LogInformation($"TranscriptRenderer keys: {DescribeKeys(transcriptRenderer)}");

            var cueGroups = transcriptRenderer.SelectToken("body.transcriptBodyRenderer.cueGroups");
            if (cueGroups?.HasValues == true)
            {
                this.Context.Logger.LogInformation($"Found {cueGroups.Count()} cue groups in primary path");
                foreach (var cueGroup in cueGroups)
                {
                    var segment = CreateTranscriptSegment((cueGroup as JObject)?["transcriptSegmentRenderer"]);
                    if (segment != null)
                    {
                        segments.Add(segment);
                    }
                }

                return segments.Count > 0;
            }

            this.Context.Logger.LogInformation("Trying alternative structure extraction");
            return TryExtractFromAlternativeStructure(transcriptRenderer, segments);
        }
        catch (Exception ex)
        {
            this.Context.Logger.LogWarning($"Primary extraction failed: {ex.Message}");
        }

        return false;
    }

    /// <summary>
    /// Handles the layout where segments sit under
    /// <c>content.transcriptSearchPanelRenderer.body.transcriptSegmentListRenderer.initialSegments</c>,
    /// falling back to shallow deep searches of the panel and then the content node.
    /// </summary>
    /// <returns>True when at least one segment was added.</returns>
    private bool TryExtractFromAlternativeStructure(JToken transcriptRenderer, List<TranscriptSegment> segments)
    {
        var content = transcriptRenderer.SelectToken("content");
        if (content == null)
        {
            return false;
        }

        this.Context.Logger.LogInformation($"Content keys: {DescribeKeys(content)}");

        var searchPanel = content.SelectToken("transcriptSearchPanelRenderer");
        if (searchPanel != null)
        {
            this.Context.Logger.LogInformation($"SearchPanel keys: {DescribeKeys(searchPanel)}");

            var initialSegments = searchPanel.SelectToken("body.transcriptSegmentListRenderer.initialSegments");
            if (initialSegments?.HasValues == true)
            {
                this.Context.Logger.LogInformation($"Found {initialSegments.Count()} initial segments");
                foreach (var segmentItem in initialSegments)
                {
                    // Items of other kinds have no transcriptSegmentRenderer and are skipped.
                    var segment = CreateTranscriptSegment((segmentItem as JObject)?["transcriptSegmentRenderer"]);
                    if (segment != null)
                    {
                        segments.Add(segment);
                    }
                }

                if (segments.Count > 0)
                {
                    this.Context.Logger.LogInformation($"Extracted {segments.Count} segments from initialSegments");
                    return true;
                }
            }

            SearchForTranscriptContentOptimized(searchPanel, segments, 0, NestedSearchDepth);
            if (segments.Count > 0)
            {
                this.Context.Logger.LogInformation($"Extracted {segments.Count} segments from search panel deep search");
                return true;
            }
        }

        SearchForTranscriptContentOptimized(content, segments, 0, NestedSearchDepth);
        if (segments.Count > 0)
        {
            this.Context.Logger.LogInformation($"Extracted {segments.Count} segments from direct content");
            return true;
        }

        return false;
    }

    /// <summary>
    /// Recursively collects objects that look like transcript segments (a
    /// <c>startMs</c> plus <c>snippet</c> or <c>text</c>), bounded by depth and by
    /// <see cref="MaxDeepSearchSegments"/>.
    /// </summary>
    private void SearchForTranscriptContentOptimized(JToken token, List<TranscriptSegment> segments, int depth, int maxDepth)
    {
        if (depth > maxDepth || token == null || segments.Count > MaxDeepSearchSegments)
        {
            return;
        }

        if (token.Type == JTokenType.Object)
        {
            var obj = (JObject)token;

            if (obj.TryGetValue("startMs", out _) &&
                (obj.TryGetValue("snippet", out _) || obj.TryGetValue("text", out _)))
            {
                var segment = CreateTranscriptSegment(obj);
                if (segment != null)
                {
                    segments.Add(segment);
                }
            }

            foreach (var property in obj.Properties())
            {
                if (segments.Count > MaxDeepSearchSegments)
                {
                    break;
                }

                SearchForTranscriptContentOptimized(property.Value, segments, depth + 1, maxDepth);
            }
        }
        else if (token.Type == JTokenType.Array)
        {
            foreach (var item in (JArray)token)
            {
                if (segments.Count > MaxDeepSearchSegments)
                {
                    break;
                }

                SearchForTranscriptContentOptimized(item, segments, depth + 1, maxDepth);
            }
        }
    }

    /// <summary>
    /// Builds a segment from a renderer object.
    /// </summary>
    /// <param name="renderer">An object carrying startMs/endMs and snippet or text.</param>
    /// <returns>The segment, or null when the token is not an object, has no text, or cannot be read.</returns>
    private TranscriptSegment CreateTranscriptSegment(JToken renderer)
    {
        var obj = renderer as JObject;
        if (obj == null)
        {
            return null;
        }

        try
        {
            var snippet = obj["snippet"] as JObject;

            // ExtractTextFromRuns returns "" (never null), so choose by emptiness rather than with ??.
            var text = ExtractTextFromRuns(snippet?["runs"]);
            if (string.IsNullOrWhiteSpace(text))
            {
                text = snippet?["simpleText"]?.ToString();
            }

            if (string.IsNullOrWhiteSpace(text))
            {
                text = obj["text"]?.ToString();
            }

            if (string.IsNullOrWhiteSpace(text))
            {
                return null;
            }

            var startMs = ParseMilliseconds(obj["startMs"]?.ToString());
            var endMs = ParseMilliseconds(obj["endMs"]?.ToString());
            if (endMs < startMs)
            {
                // Missing or unparsable endMs: treat the segment as zero-length rather than negative.
                endMs = startMs;
            }

            var cleanText = CleanTranscriptTextOptimized(text);
            var durationMs = endMs - startMs;

            return new TranscriptSegment
            {
                Text = cleanText,
                StartMs = startMs,
                EndMs = endMs,
                DurationMs = durationMs,
                StartTime = obj.SelectToken("startTimeText.simpleText")?.ToString() ?? "",
                StartTimeFormatted = FormatDuration(startMs),
                EndTimeFormatted = FormatDuration(endMs),
                DurationFormatted = FormatDuration(durationMs),
                WordCount = CountWordsOptimized(cleanText),
                CharacterCount = cleanText.Length
            };
        }
        catch (Exception)
        {
            // Best effort: one malformed segment must not fail the whole transcript.
            return null;
        }
    }

    /// <summary>Concatenates the <c>text</c> of each run; returns "" when there are no runs.</summary>
    private string ExtractTextFromRuns(JToken runs)
    {
        if (runs?.HasValues != true)
        {
            return "";
        }

        var sb = new StringBuilder();
        foreach (var run in runs)
        {
            var text = (run as JObject)?["text"]?.ToString();
            if (!string.IsNullOrEmpty(text))
            {
                sb.Append(text);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Removes music notes and bracketed/parenthesised annotations, then collapses
    /// whitespace. Collapsing last avoids the double spaces the removals leave behind.
    /// </summary>
    private string CleanTranscriptTextOptimized(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return "";
        }

        text = text.Replace("♪", "");
        text = SoundEffectsRegex.Replace(text, "");
        text = BackgroundNoiseRegex.Replace(text, "");
        text = WhitespaceRegex.Replace(text, " ");

        return text.Trim();
    }

    /// <summary>Counts whitespace-separated words.</summary>
    private int CountWordsOptimized(string text) =>
        string.IsNullOrWhiteSpace(text) ? 0 :
        text.Split(new[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries).Length;

    /// <summary>Parses a millisecond count (culture-invariant); returns 0 when it cannot be parsed.</summary>
    private int ParseMilliseconds(string msString) =>
        int.TryParse(msString, System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture, out int ms) ? ms : 0;

    /// <summary>Formats milliseconds as <c>m:ss</c>, or <c>h:mm:ss</c> from one hour up; negatives format as 0:00.</summary>
    private string FormatDuration(int milliseconds)
    {
        var timeSpan = TimeSpan.FromMilliseconds(Math.Max(0, milliseconds));
        return timeSpan.TotalHours >= 1 ?
            $"{(int)timeSpan.TotalHours}:{timeSpan.Minutes:D2}:{timeSpan.Seconds:D2}" :
            $"{timeSpan.Minutes}:{timeSpan.Seconds:D2}";
    }

    /// <summary>
    /// Returns the title of the selected entry in the transcript language menu, or
    /// <see cref="DefaultLanguage"/> when it cannot be determined.
    /// </summary>
    private string ExtractLanguageInfo(JObject youtubeData)
    {
        const string menuSuffix = ".footer.transcriptFooterRenderer.languageMenu.sortFilterSubMenuRenderer.subMenuItems";

        // Legacy layout first; the second path assumes the search-panel layout keeps the
        // same footer shape — NOTE(review): confirm against a live response.
        var candidatePaths = new[]
        {
            TranscriptRendererPath + menuSuffix,
            TranscriptRendererPath + ".content.transcriptSearchPanelRenderer" + menuSuffix
        };

        try
        {
            foreach (var path in candidatePaths)
            {
                var items = youtubeData.SelectToken(path) as JArray;
                var selected = items?
                    .OfType<JObject>()
                    .FirstOrDefault(item => item.Value<bool?>("selected") == true);

                var title = selected?["title"]?.ToString();
                if (!string.IsNullOrWhiteSpace(title))
                {
                    return title;
                }
            }
        }
        catch (Exception ex)
        {
            // Language is informational only; fall back rather than fail the request.
            this.Context.Logger.LogWarning($"Language extraction failed: {ex.Message}");
        }

        return DefaultLanguage;
    }

    /// <summary>Current UTC time as an ISO 8601 string with second precision.</summary>
    private static string UtcTimestamp() =>
        DateTime.UtcNow.ToString("yyyy-MM-dd'T'HH:mm:ss'Z'", System.Globalization.CultureInfo.InvariantCulture);

    /// <summary>Builds the <c>success: false</c> payload used when transformation fails, with a sample of the raw response.</summary>
    private JObject CreateFallbackResponse(string originalResponse, string errorMessage)
    {
        var raw = originalResponse ?? "";
        return new JObject
        {
            ["success"] = false,
            ["error"] = $"Response transformation failed: {errorMessage}",
            ["segments"] = new JArray(),
            ["totalSegments"] = 0,
            ["processedAt"] = UtcTimestamp(),
            ["rawResponseSample"] = raw.Length > 2000 ? raw.Substring(0, 2000) + "..." : raw
        };
    }

    /// <summary>Builds an error response with the given status and a <c>success: false</c> JSON body.</summary>
    private HttpResponseMessage CreateErrorResponse(HttpStatusCode statusCode, string message)
    {
        var errorObject = new JObject
        {
            ["success"] = false,
            ["error"] = message,
            ["segments"] = new JArray(),
            ["totalSegments"] = 0,
            ["processedAt"] = UtcTimestamp()
        };

        return new HttpResponseMessage(statusCode)
        {
            Content = CreateJsonContent(errorObject.ToString())
        };
    }

    /// <summary>
    /// One transcript segment. The JSON names match the camelCase properties of
    /// <c>TranscriptSegment</c> in the connector's Swagger definition.
    /// </summary>
    private sealed class TranscriptSegment
    {
        [JsonProperty("text")]
        public string Text { get; set; }

        [JsonProperty("startMs")]
        public int StartMs { get; set; }

        [JsonProperty("endMs")]
        public int EndMs { get; set; }

        [JsonProperty("durationMs")]
        public int DurationMs { get; set; }

        [JsonProperty("startTime")]
        public string StartTime { get; set; }

        [JsonProperty("startTimeFormatted")]
        public string StartTimeFormatted { get; set; }

        [JsonProperty("endTimeFormatted")]
        public string EndTimeFormatted { get; set; }

        [JsonProperty("durationFormatted")]
        public string DurationFormatted { get; set; }

        [JsonProperty("wordCount")]
        public int WordCount { get; set; }

        [JsonProperty("characterCount")]
        public int CharacterCount { get; set; }
    }
}