Skip to content

Commit 2444c04

Browse files
authored
Merge pull request #102 from r-world-devs/waisk/34/prepare_release
pre-release
2 parents 3f8c005 + 8c35f51 commit 2444c04

26 files changed

+693
-203
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,7 @@
22
.Renviron
33
docs
44
inst/demo-app/rsconnect/
5+
6+
/.quarto/
7+
README.html
8+
inst/doc

DESCRIPTION

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
Package: GitAI
22
Title: Extracts Knowledge From Git Repositories
3-
Version: 0.0.0.9015
3+
Version: 0.0.0.9017
44
Authors@R: c(
55
person("Kamil", "Wais", , "[email protected]", role = c("aut", "cre")),
66
person("Krystian", "Igras", , "[email protected]", role = "aut"),
77
person("Maciej", "Banas", , "[email protected]", role = "aut")
88
)
9-
Description: Scan multiple Git repositories, pull specified files content and process it with Large Language Models. You can summarize the content in specific way, extract information and data, or find answers to your questions about the repositories.
9+
Description: Scan multiple Git repositories, pull specified files content and process it with Large Language Models. You can summarize the content in a specific way, extract information and data, or find answers to your questions about the repositories. The output can be stored in a vector database and used for semantic search or as a part of a RAG (Retrieval Augmented Generation) prompt.
1010
License: MIT + file LICENSE
1111
Encoding: UTF-8
1212
Roxygen: list(markdown = TRUE)
@@ -15,8 +15,8 @@ Depends:
1515
R (>= 4.1.0)
1616
Imports:
1717
cli (>= 3.4.0),
18-
elmer,
19-
GitStats,
18+
ellmer,
19+
GitStats (>= 2.2.0),
2020
httr2,
2121
lubridate,
2222
R6,
@@ -28,5 +28,9 @@ Imports:
2828
Suggests:
2929
testthat (>= 3.0.0),
3030
shiny,
31-
withr
31+
shinychat,
32+
withr,
33+
knitr,
34+
rmarkdown
3235
Config/testthat/edition: 3
36+
VignetteBuilder: knitr

R/GitAI-package.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@
88
#' within file marked at '.Rbuildignore' file.
99
missing_deps_note_fix <- function() {
1010
R6::R6Class
11-
elmer::chat_ollama
11+
ellmer::chat_ollama
1212
lubridate::as_datetime
1313
}

R/Pinecone.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ Pinecone <- R6::R6Class(
114114
})
115115
},
116116

117-
list_record_IDs = function() {
117+
list_record_ids = function() {
118118

119119
pinecone_api_key <- Sys.getenv("PINECONE_API_KEY")
120120

@@ -147,7 +147,7 @@ Pinecone <- R6::R6Class(
147147
has_next_page <- "pagination" %in% names(response_body)
148148
}
149149

150-
return(record_ids)
150+
record_ids
151151
},
152152

153153
purge_records = function(ids) {

R/process_content.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ process_content <- function(gitai, content, max_words = 80000, verbose) {
1616
}
1717

1818
llm_clone <- gitai$llm$clone(deep = TRUE)
19-
2019
llm_clone$chat(content)
2120

2221
turn <- llm_clone$last_turn("assistant")

R/process_repos.R

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,12 @@ process_repos <- function(
2121
add_contributors = FALSE,
2222
verbose = verbose
2323
)
24-
GitStats::get_files_structure(
24+
files_content <- GitStats::get_files(
2525
gitstats,
2626
pattern = paste0(gitai$files, collapse = "|"),
2727
depth = depth,
2828
verbose = verbose
2929
)
30-
files_content <- GitStats::get_files_content(gitstats, verbose = verbose)
3130

3231
distinct_repos <- files_content |>
3332
dplyr::distinct(repo_name, api_url)

R/set_llm.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
#' @name set_llm
44
#' @param gitai A \code{GitAI} object.
55
#' @param provider Name of LLM provider, a string. Results with setting up LLM using
6-
#' \code{elmer::chat_<provider>} function.
7-
#' @param ... Other arguments to pass to corresponding \code{elmer::chat_<provider>} function.
6+
#' \code{ellmer::chat_<provider>} function.
7+
#' @param ... Other arguments to pass to corresponding \code{ellmer::chat_<provider>} function.
88
#' Please use \link{get_llm_defaults} to get default model arguments.
99
#' @return A \code{GitAI} object.
1010
#' @export
1111
set_llm <- function(gitai, provider = "openai", ...) {
1212

1313
provider_method <- rlang::env_get(
14-
env = asNamespace("elmer"),
14+
env = asNamespace("ellmer"),
1515
nm = glue::glue("chat_{provider}")
1616
)
1717
provider_args <- purrr::list_modify(

R/test-helpers.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ PineconeMocked <- R6::R6Class(
136136
})
137137
},
138138

139-
list_record_IDs = function() {
139+
list_record_ids = function() {
140140
pinecone_api_key <- Sys.getenv("PINECONE_API_KEY")
141141

142142
url <- paste0("https://", private$.index_host)
@@ -153,7 +153,7 @@ PineconeMocked <- R6::R6Class(
153153
)
154154

155155
response <- httr2::response_json(
156-
body = test_fixtures[["list_record_IDs"]]
156+
body = test_fixtures[["list_record_ids"]]
157157
)
158158

159159
response_body <- httr2::resp_body_json(response)
@@ -261,7 +261,7 @@ test_fixtures[["read_record"]] <- list(
261261
"usage" = list("readUnits" = 1L)
262262
)
263263

264-
test_fixtures[["list_record_IDs"]] <- list(
264+
test_fixtures[["list_record_ids"]] <- list(
265265
"vectors" = list(
266266
list(
267267
"id" = "project_1"

README.Rmd

Lines changed: 67 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,46 @@ knitr::opts_chunk$set(
1919
[![Codecov test coverage](https://codecov.io/gh/r-world-devs/GitAI/graph/badge.svg)](https://app.codecov.io/gh/r-world-devs/GitAI)
2020
<!-- badges: end -->
2121

22-
The goal of GitAI is to derive knowledge from GitHub or GitLab repositories with the use of AI/LLM (Large Language Models). With GitAI you can easily:
22+
> The goal of `GitAI` is to **extract knowledge from Git repositories** with the use of AI/LLM (Large Language Models).
2323
24-
- set up your project scope (Git repositories),
25-
- select content of interest (files and file types),
26-
- choose your LLM backend,
27-
- define the LLM prompts,
28-
- process content of all repositories with a single function call.
24+
## Motivation
2925

30-
And all of that in a nice tidyverse style.
26+
Large organizations need to deal with a massive number of git repositories
27+
(both internal and external). Those repositories can be hosted on different
28+
platforms (like `GitHub` and `GitLab`).
29+
30+
It is very difficult or even impossible to review all those repositories
31+
manually, especially if one needs to perform an exploratory search,
32+
not knowing the exact keywords that should be used.
33+
34+
Because of that the reusability of the knowledge (and code) hidden in the
35+
repositories is a constant challenge.
36+
37+
## Solution
38+
39+
We propose the `GitAI` framework written in R.
40+
41+
It is applicable to multiple use cases related to extracting knowledge from Git repositories.
42+
At the same time, it is IT infrastructure agnostic. It is designed to work with
43+
different backends, LLMs, embeddings models, and vector databases.
44+
Adapting to particular backends may need implementation of new classes, but
45+
the core functionality stays the same.
46+
47+
## Workflow
48+
49+
A typical `GitAI` workflow looks like this:
50+
51+
1. Set up your project.
52+
1. Set up your project scope (Git repositories).
53+
1. Select content type of interest (files and file types).
54+
1. Choose your LLM backend.
55+
1. Define the LLM prompts.
56+
1. (Optional) Choose embedding model and vector database provider.
57+
1. Process content of all repositories with a single function call.
58+
1. (Optional) If a vector database is set up, the results will be stored there.
59+
1. Use the information extracted from files content from git repositories.
60+
1. (Optional) If results are stored in a vector database,
61+
they can be searched using *semantic search* or used as a part of a RAG (*Retrieval Augmented Generation*) prompt.
3162

3263
## Installation
3364

@@ -38,21 +69,43 @@ You can install the development version of `GitAI` from [GitHub](https://github.
3869
pak::pak("r-world-devs/GitAI")
3970
```
4071

41-
## Example workflow
42-
43-
Basic workflow could look like:
72+
## Simplified example (without vector database usage)
4473

4574
```{r}
4675
library(GitAI)
47-
# Set up project
76+
```
77+
78+
Let's set up a project `fascinating_project` that will extract some summaries from the content of the `README.md` files in a few selected git repositories.
79+
80+
81+
```{r}
82+
options(ellmer_timeout_s = 120)
4883
verbose_off()
4984
my_project <- initialize_project("fascinating_project") |>
50-
set_github_repos(repos = c("r-world-devs/GitStats", "r-world-devs/GitAI", "openpharma/DataFakeR")) |>
85+
set_github_repos(
86+
repos = c(
87+
"r-world-devs/GitStats",
88+
"r-world-devs/GitAI",
89+
"openpharma/DataFakeR"
90+
)
91+
) |>
5192
add_files(files = "README.md") |>
5293
set_llm() |>
5394
set_prompt("Write one-sentence summary for a project based on given input.")
95+
```
5496

55-
# Get the results
97+
Now, let's get the results and print them.
98+
99+
```{r}
56100
results <- process_repos(my_project)
57-
purrr::map(results, ~.$text)
101+
102+
purrr::walk(results, function(result) {
103+
result$text |> stringr::str_wrap(width = 80) |> cat("\n\n")
104+
})
58105
```
106+
107+
## See also
108+
109+
Under the hood, `GitAI` uses the `GitStats` R package.
110+
If you want to use it directly for pulling git data, check out:
111+
[https://r-world-devs.github.io/GitStats/](https://r-world-devs.github.io/GitStats/)

README.md

Lines changed: 83 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,51 @@
99
coverage](https://codecov.io/gh/r-world-devs/GitAI/graph/badge.svg)](https://app.codecov.io/gh/r-world-devs/GitAI)
1010
<!-- badges: end -->
1111

12-
The goal of GitAI is to derive knowledge from GitHub or GitLab
13-
repositories with the use of AI/LLM (Large Language Models). With GitAI
14-
you can easily:
12+
> The goal of `GitAI` is to **extract knowledge from Git repositories**
13+
> with the use of AI/LLM (Large Language Models).
1514
16-
- set up your project scope (Git repositories),
17-
- select content of interest (files and file types),
18-
- choose your LLM backend,
19-
- define the LLM prompts,
20-
- process content of all repositories with a single function call.
15+
## Motivation
2116

22-
And all of that in a nice tidyverse style.
17+
Large organizations need to deal with a massive number of git repositories
18+
(both internal and external). Those repositories can be hosted on
19+
different platforms (like `GitHub` and `GitLab`).
20+
21+
It is very difficult or even impossible to review all those repositories
22+
manually, especially if one needs to perform an exploratory search, not
23+
knowing the exact keywords that should be used.
24+
25+
Because of that the reusability of the knowledge (and code) hidden in
26+
the repositories is a constant challenge.
27+
28+
## Solution
29+
30+
We propose the `GitAI` framework written in R.
31+
32+
It is applicable to multiple use cases related to extracting knowledge
33+
from Git repositories. At the same time, it is IT infrastructure agnostic.
34+
It is designed to work with different backends, LLMs, embeddings models,
35+
and vector databases. Adapting to particular backends may need
36+
implementation of new classes, but the core functionality stays the
37+
same.
38+
39+
## Workflow
40+
41+
A typical `GitAI` workflow looks like this:
42+
43+
1. Set up your project.
44+
1. Set up your project scope (Git repositories).
45+
2. Select content type of interest (files and file types).
46+
3. Choose your LLM backend.
47+
4. Define the LLM prompts.
48+
5. (Optional) Choose embedding model and vector database provider.
49+
2. Process content of all repositories with a single function call.
50+
1. (Optional) If a vector database is set up, the results will be
51+
stored there.
52+
3. Use the information extracted from files content from git
53+
repositories.
54+
1. (Optional) If results are stored in a vector database, they can be
55+
searched using *semantic search* or used as a part of a RAG
56+
(*Retrieval Augmented Generation*) prompt.
2357

2458
## Installation
2559

@@ -31,29 +65,56 @@ You can install the development version of `GitAI` from
3165
pak::pak("r-world-devs/GitAI")
3266
```
3367

34-
## Example workflow
35-
36-
Basic workflow could look like:
68+
## Simplified example (without vector database usage)
3769

3870
``` r
3971
library(GitAI)
40-
# Set up project
72+
```
73+
74+
Let’s set up a project `fascinating_project` that will extract some
75+
summaries from the content of the `README.md` files in a few selected
76+
git repositories.
77+
78+
``` r
79+
options(ellmer_timeout_s = 120)
4180
verbose_off()
4281
my_project <- initialize_project("fascinating_project") |>
43-
set_github_repos(repos = c("r-world-devs/GitStats", "r-world-devs/GitAI", "openpharma/DataFakeR")) |>
82+
set_github_repos(
83+
repos = c(
84+
"r-world-devs/GitStats",
85+
"r-world-devs/GitAI",
86+
"openpharma/DataFakeR"
87+
)
88+
) |>
4489
add_files(files = "README.md") |>
4590
set_llm() |>
4691
set_prompt("Write one-sentence summary for a project based on given input.")
92+
```
93+
94+
Now, let’s get the results and print them.
4795

48-
# Get the results
96+
``` r
4997
results <- process_repos(my_project)
50-
purrr::map(results, ~.$text)
51-
#> $GitStats
52-
#> [1] "GitStats is an R package that enables users to extract and analyze GitHub and GitLab data, such as repository details, commits, and user activity, in a standardized table format."
98+
99+
purrr::walk(results, function(result) {
100+
result$text |> stringr::str_wrap(width = 80) |> cat("\n\n")
101+
})
102+
#> GitStats is an experimental R package that facilitates the extraction
103+
#> and analysis of git data from GitHub and GitLab, providing insights into
104+
#> repositories, commits, users, and R package usage in a structured format.
53105
#>
54-
#> $GitAI
55-
#> [1] "GitAI is an R package designed to harness the power of AI and Large Language Models to extract insights from GitHub or GitLab repositories in a user-friendly, tidyverse style, enabling users to set project scopes, select content of interest, and process repositories with ease."
106+
#> GitAI is an R package that leverages AI and Large Language Models to extract
107+
#> insights from GitHub or GitLab repositories, allowing users to define project
108+
#> scopes, select relevant content, and process repositories efficiently in a
109+
#> tidyverse-compliant manner.
56110
#>
57-
#> $DataFakeR
58-
#> [1] "DataFakeR is an experimental R package designed to generate fake data samples that maintain specified characteristics of original datasets, streamlined through customizable configurations and schema management."
111+
#> DataFakeR is an R package that enables users to generate synthetic datasets
112+
#> while maintaining specified assumptions about the original data structure,
113+
#> facilitating data simulation for testing and analysis.
59114
```
115+
116+
## See also
117+
118+
Under the hood, `GitAI` uses the `GitStats` R package. If you want to
119+
use it directly for pulling git data, check out:
120+
<https://r-world-devs.github.io/GitStats/>

0 commit comments

Comments
 (0)