author    Nate Buttke <nate-web@riseup.net>  2023-08-14 20:35:45 -0700
committer Nate Buttke <nate-web@riseup.net>  2023-08-14 20:35:45 -0700
commit    7435e423776c7b35b9c6c9bebba25a44691554bf (patch)
tree      2196a0e68bc9ff2c2df92590444eee8b785f4a11
parent    f334391613e01057d572e0228aa4f3c2f24346dc (diff)
huge fix to path handling. added clean CLI, cost estimate, ignorefile.
-rw-r--r--  requirements.txt  |  14
-rw-r--r--  server.py         | 208
-rw-r--r--  setup.py          | 200
3 files changed, 263 insertions, 159 deletions
diff --git a/requirements.txt b/requirements.txt
index 1beac1b..b60ffb4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
-Flask==2.3.2
-numpy==1.25.0
-openai==0.27.8
-pandas==2.0.3
-tiktoken==0.4.0
-tree_sitter==0.20.1
+Flask
+numpy
+openai
+pandas
+tiktoken
+tree_sitter
+typer
+typing_extensions
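Note: dropping the version pins is risky for openai in particular. server.py below still imports openai.embeddings_utils, which was removed in openai 1.0, so an unpinned install can break at import time. A minimal guard, assuming the pre-1.0 SDK layout (illustrative, not part of this commit):

# Hypothetical guard: fail fast if the installed openai SDK no longer
# ships embeddings_utils (it was removed in openai>=1.0).
try:
    from openai.embeddings_utils import get_embedding, cosine_similarity
except ImportError as exc:
    raise SystemExit("openai>=1.0 removed embeddings_utils; pin openai<1.0") from exc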
diff --git a/server.py b/server.py
index 2bb203d..1cd914c 100644
--- a/server.py
+++ b/server.py
@@ -1,49 +1,56 @@
-from flask import Flask, request, Response, jsonify, render_template
-import pandas as pd
from collections import defaultdict
import os
-import json
+import sys
+from pathlib import PurePosixPath
+import base64
+from flask import Flask, request, jsonify, render_template
+import pandas as pd
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
openai.api_key = os.getenv('OPENAI_KEY')
-import numpy as np
-from pathlib import PurePosixPath
-FILETYPES = ['.sh', '.c', '.h', '.cpp', '.cxx', '.hxx', '.hpp', '.go', '.hs', '.js', '.py', '.rs']
+if len(sys.argv) != 2:
+ print("USAGE: python server.py PATH_TO_CSV")
+ print("wrong number of arguments", file=sys.stderr)
+ sys.exit(1)
+try:
+ df = pd.read_csv(sys.argv[1], converters={"embedding_summary": pd.eval})
+except Exception:
+ print(f"Problem opening {sys.argv[1]}", file=sys.stderr)
+ sys.exit(1)
+
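The converter matters because pandas writes list columns to csv as plain strings, so they must be re-parsed on load. A minimal illustration (ast.literal_eval shown as a stricter equivalent of the pd.eval converter used above):

# Embeddings round-trip through csv as strings like "[0.1, 0.2]";
# a converter re-parses them into Python lists at load time.
import ast
raw = "[0.1, 0.2, 0.3]"         # what read_csv yields without a converter
parsed = ast.literal_eval(raw)  # stricter equivalent of pd.eval here
assert parsed[1] == 0.2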
app = Flask(__name__, template_folder="./frontend", static_folder="./frontend", static_url_path="")
def search_code(df, query, n=4):
query_embedding = get_embedding(
- query,
- engine="text-embedding-ada-002"
- )
+ query,
+ engine="text-embedding-ada-002"
+ )
df["similarity"] = df.embedding_summary.apply(lambda x: cosine_similarity(x, query_embedding))
results = (
- df.sort_values("similarity", ascending=False)
- )
+ df.sort_values("similarity", ascending=False)
+ )
return results.head(n)
-
def generate_answer(question):
- results = search_code(df, question, n=4)
- prompt = ''
- for i in range(3):
- prompt += results.iloc[i]["summary"] + "\n" + results.iloc[i]["blob"] + "\n"
- prompt += "\n" + "Answer the following question using the code context given above, and show an example with 'Example'\nQ: " + question + "\nA: "
- response = openai.Completion.create(
- model="text-davinci-003",
- # model="code-davinci-002",
- prompt=prompt,
- temperature=0.7,
- max_tokens=1000,
- top_p=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- stop=["\"\"\""]
- )
- return response["choices"][0]["text"]
-
+ results = search_code(df, question, n=4)
+ prompt = ''
+ for i in range(3):
+ prompt += results.iloc[i]["summary"] + "\n" + results.iloc[i]["blob"] + "\n"
+ prompt += "\n" + "Answer the following question using the code context given above, and show an example with 'Example'\nQ: " + question + "\nA: "
+ response = openai.Completion.create(
+ model="text-davinci-003",
+ # model="code-davinci-002",
+ prompt=prompt,
+ temperature=0.7,
+ max_tokens=1000,
+ top_p=1.0,
+ frequency_penalty=0.0,
+ presence_penalty=0.0,
+ stop=["\"\"\""]
+ )
+ return response["choices"][0]["text"]
def add_to_tree(tree: dict, path: str):
parts = PurePosixPath(path).parts
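For context, search_code above ranks every stored embedding against the query embedding by cosine similarity and keeps the top n rows. A self-contained sketch of that ranking step, using numpy in place of the openai helper (assumes embedding_summary holds lists of floats):

import numpy as np
import pandas as pd

def rank_rows(df: pd.DataFrame, query_embedding, n: int = 4) -> pd.DataFrame:
    # cosine similarity of each stored embedding against the query vector
    sims = df["embedding_summary"].apply(
        lambda e: np.dot(e, query_embedding)
        / (np.linalg.norm(e) * np.linalg.norm(query_embedding))
    )
    return df.assign(similarity=sims).nlargest(n, "similarity")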
@@ -65,72 +72,111 @@ def create_directory_tree(df):
add_to_tree(directory_tree, path)
return directory_tree
-# Nate: these are from the original project. My own csv is below
-#df = pd.read_csv("./frontend/data/embedded_summarized.csv")
-#df = pd.read_csv("./frontend/data/withsummary2.csv")
+def get_outermost_item(dirstructure):
+ return list(dirstructure.keys())[0]
-# My line
-df=pd.read_csv("./frontend/data/test_with_summary_and_embeddings.csv", converters={'embedding_summary': pd.eval})
+def get_kids_of_root(dirstructure):
+ return list(dirstructure.values())[0].get("children").keys()
-# need to do funny stuff to read in the data frame correctly from csv. that's
-# why the eval() is below. and pd.eval is above.
-# df['embeddings'] = df['embeddings'].apply(lambda x: eval(x))
def check_path(path, dirstructure):
- children = None
- components = PurePosixPath(path).parts
- if(components[0] in dirstructure.keys()):
- print('SCREAM')
-
- currentdict = dirstructure.get(components[0]) #outermost is solitary by setup.py
- print(len(components))
- for component in components[1:]:
- print('COMP', component)
- if component in currentdict.get("children", {}):
- currentdict = currentdict["children"][component]
- else:
- return False, None, children
- if currentdict["filetype"] == "dir":
- children = currentdict["children"]
- return True, currentdict["filetype"], children
- else:
+ components = PurePosixPath(path).parts
+ if components[0] in dirstructure.keys():
+ if dirstructure[components[0]]["filetype"] == "dir":
+ subdict = dirstructure.get(components[0])
+ if len(components) == 1:
+ ftype = subdict["filetype"]
+ kids = list(subdict["children"].keys())
+ return True, str(ftype), kids
+ else:
+ for c in components[1:]:
+ if c in subdict.get("children", {}):
+ subdict = subdict["children"][c]
+ ftype = subdict["filetype"]
+ kids = list(subdict["children"].keys()) if ftype == 'dir' else None
+ else:
+ return False, None, None
+ return True, str(ftype), kids
+ else:
+ return True, "file", None
return False, None, None
+
+# need to do this to read in the data frame correctly from csv. (pd.eval)
+#df=pd.read_csv("./frontend/data/test_with_summary_and_embeddings.csv", converters={'embedding_summary': pd.eval})
+#df=pd.read_csv("./frontend/data/r2.csv", converters={'embedding_summary': pd.eval})
+
+# old strategy:
+# df['embeddings'] = df['embeddings'].apply(lambda x: eval(x))
+
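check_path walks the nested dict built by create_directory_tree. The shape it expects, with illustrative names (inferred from the filetype/children keys used above):

# Illustrative tree shape; file and directory names are made up.
tree = {
    "repo": {
        "filetype": "dir",
        "children": {
            "main.py": {"filetype": "file", "children": {}},
            "src": {
                "filetype": "dir",
                "children": {"lib.rs": {"filetype": "file", "children": {}}},
            },
        },
    }
}
# check_path("repo/src", tree)  -> (True, "dir", ["lib.rs"])
# check_path("repo/nope", tree) -> (False, None, None)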
@app.route('/')
def home():
- req_path = request.args.get('path')
- dirstructure = create_directory_tree(df)
- if req_path is None:
- loctype = "folder"
- text = [[x, ""] for x in list(df['file_path'].unique())]
- else:
- text = [[x, y] for x, y in zip(
- list(df[df['file_path'] == req_path]['source']),
- list(df[df['file_path'] == req_path]['summary'])
- )]
- loctype = "file"
-
- res = {
- 'loctype': loctype,
- 'text': text,
- 'current': 'root directory' if (req_path is None) else req_path
- }
- return render_template('index.html', payload=res)
+ req_path = request.args.get('path')
+ path_decode = None if req_path is None else base64.urlsafe_b64decode(req_path).decode('utf-8')
+ dirstructure = create_directory_tree(df)
+ if req_path is None:
+ rootname = get_outermost_item(dirstructure)
+ if check_path(rootname, dirstructure)[1] == "file":
+ if len(dirstructure.keys()) == 1:
+ loctype = "file"
+ text = [[(x, None), y] for x, y in zip(
+ list(df[df["file_path"] == rootname]["source"]),
+ list(df[df["file_path"] == rootname]["summary"])
+ )]
+ else:
+ loctype = "folder"
+ files = list(dirstructure.keys())
+ text = [[(x, base64.urlsafe_b64encode(bytes(x, 'utf-8')).decode("utf-8")), ""] for x in files]
+ else:
+ loctype = "folder"
+ if not rootname.endswith("/"):
+ rootname += '/'
+ kids = get_kids_of_root(dirstructure)
+ text = [[(k, base64.urlsafe_b64encode(bytes(rootname + k, 'utf-8')).decode('utf-8')), ""] for k in kids]
+ else:
+ path_info = check_path(path_decode, dirstructure)
+ if path_info[0] is False:
+ text = [["error", "path not found"]]
+ loctype = "file"
+ elif path_info[1] == "file":
+ loctype = "file"
+ text = [[(x, None), y] for x, y in zip(
+ list(df[df["file_path"] == path_decode]["source"]),
+ list(df[df["file_path"] == path_decode]["summary"])
+ )]
+ elif path_info[1] == "dir":
+ loctype = "folder"
+ text = [[(x, base64.urlsafe_b64encode(bytes(path_decode + "/" + x, 'utf-8')).decode("utf-8")), ""] for x in path_info[2]]
+
+ if req_path is not None:
+ curr = path_decode
+ elif loctype == "folder":
+ curr = 'root directory'
+ else:
+ curr = get_outermost_item(dirstructure)
+
+ res = {
+ 'loctype': loctype,
+ 'text': text,
+ 'current': curr
+ }
+ return render_template('index.html', payload=res)
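Every link home() emits encodes its path as urlsafe base64 so slashes survive the query string; the same route decodes it on the way back in. The round-trip, for reference:

import base64
token = base64.urlsafe_b64encode("src/lib.rs".encode("utf-8")).decode("utf-8")
assert base64.urlsafe_b64decode(token).decode("utf-8") == "src/lib.rs"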
@app.route('/answer')
def answer():
- q = request.args.get('q', '').strip()
- a = search_code(df, q)
- res = [{'blob': x['blob'], 'summary': x['summary']} for x in a.to_dict('records')]
-
- return jsonify(res)
+ q = request.args.get('q', '').strip()
+ a = search_code(df, q)
+ res = [{'blob': x['blob'], 'summary': x['summary']} for x in a.to_dict('records')]
+ return jsonify(res)
@app.route('/explain')
def explain():
- q = request.args.get('q', '').strip()
- a = generate_answer(q)
- return jsonify(a)
+ q = request.args.get('q', '').strip()
+ a = generate_answer(q)
+ return jsonify(a)
if __name__ == '__main__':
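With the new argv handling, the server starts as python server.py PATH_TO_CSV. A hypothetical smoke test against a running instance (assumes Flask's default port; requests is not a dependency of this repo):

import requests
r = requests.get("http://127.0.0.1:5000/answer", params={"q": "where is the csv loaded?"})
print(r.json())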
diff --git a/setup.py b/setup.py
index 8ff5027..9efb57c 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,19 @@
-from collections import defaultdict
import os
import sys
import pandas as pd
-import openai
+import openai
import tiktoken
-from openai.embeddings_utils import get_embedding, cosine_similarity
-
+from openai.embeddings_utils import get_embedding
from tree_sitter import Language, Parser
-
-SOURCE_DIR = './'
+from typing_extensions import Annotated
+import typer
openai.api_key = os.getenv('END_OF_WORLD')
class TS_Setup_Helper:
+ """
+ Tree sitter functions and data for the setup process
+ """
parser: Parser
ts_obj_path: str
ext_map: dict
@@ -50,7 +51,8 @@ class TS_Setup_Helper:
self.CPP_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""],
self.GO_LANGUAGE: ["""(function_declaration) @function""", """(method_declaration) @method"""],
self.JS_LANGUAGE: ["""[(function) (function_declaration)] @function"""],
- self.PY_LANGUAGE: ["""(function_definition) @function""", """[(import_statement) (import_from_statement)] @import"""],
+ self.PY_LANGUAGE: ["""(function_definition) @function""",
+ """[(import_statement) (import_from_statement)] @import"""],
self.RS_LANGUAGE: ["""(function_item) @function""", """(use_declaration) @import"""]
}
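Each qmap entry is a tree-sitter capture query. A minimal sketch of running one with the tree_sitter 0.20 API that the old requirements pinned (shared-object path as used in this repo):

from tree_sitter import Language, Parser

PY_LANGUAGE = Language("./ts-languages.so", "python")
parser = Parser()
parser.set_language(PY_LANGUAGE)
tree = parser.parse(b"def f():\n    return 1\n")
# captures() yields (node, capture_name) pairs for every match
for node, name in PY_LANGUAGE.query("(function_definition) @function").captures(tree.root_node):
    print(name, node.start_point, node.end_point)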
@@ -61,11 +63,8 @@ class TS_Setup_Helper:
def ts_get_all_code_blocks(self, code_blocks, file_path, lang, tree, code):
"""Use treesitter to get all code blocks"""
- # TODO need way to switch between declaration and definition ..
- # e.g. golang does not have function definitions according to treesitter
results = [ ]
for query in self.qmap.get(lang):
- print(query)
results += self.ts_query(lang, tree, query)
# TODO something like list comprehension here?
@@ -81,22 +80,21 @@ class TS_Setup_Helper:
code_blocks.append(return_dict)
def parse_file(self, file_path):
- print('parse')
"""take source code file and return pd dataframe"""
# read file
- with open(file_path, 'r') as f:
+ with open(file_path[0], 'r') as f:
code = f.read()
# Tree-Sitter
- extension = os.path.splitext(file_path)[1].lstrip(".")
+ extension = os.path.splitext(file_path[0])[1].lstrip(".")
lang = self.ext_map.get(extension)
if lang is None:
- raise NotImplementedError(f"The file extension .{extension} is not implemented")
+ raise NotImplementedError(f"The file extension .{extension} is not implemented ({file_path[0]})")
self.parser.set_language(lang)
tree = self.parser.parse(bytes(code, "utf8"))
code_blocks = []
- self.ts_get_all_code_blocks(code_blocks, file_path, lang, tree, bytes(code, "utf8"))
+ self.ts_get_all_code_blocks(code_blocks, file_path[1], lang, tree, bytes(code, "utf8"))
collate_types = ['import', 'assign']
tempblock = None
@@ -123,72 +121,130 @@ class TS_Setup_Helper:
return df
-def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore=['tests', 'vendor', 'unix']) -> list:
+def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore) -> list:
"""get all source file paths as list."""
files_to_parse = []
for root, dirs, files in os.walk(root_path):
- # there is probably a better way to do this
+ # there may be a better way to do this
# https://stackoverflow.com/questions/13454164/os-walk-without-hidden-folders
- files = [f for f in files if not f[0] == '.']
+ files = [
+ f for f in files if (not f[0] == '.')
+ and (os.path.splitext(f)[-1].lstrip(".") in files_extensions_to_parse)
+ ]
dirs[:] = [d for d in dirs if (not d[0] == '.') and (set(d.split()).isdisjoint(dirs_to_ignore))]
for name in files:
- #if (dirfix(root).rsplit("/", 1)[-1] in dirs_to_ignore) or (name in dirs_to_ignore) or (name.rsplit('.')[-1] not in files_extensions_to_parse):
- if (name.rsplit('.')[-1] not in files_extensions_to_parse):
- continue
- temp_path = os.path.join(root, name)
- files_to_parse.append(temp_path)
+ full = os.path.join(root, name)
+ rel_dir = os.path.relpath(root, root_path)
+ rel_filepath = os.path.join(rel_dir, name)
+ if rel_filepath.startswith("./"):
+ rel_filepath = rel_filepath[len("./"):]
+ files_to_parse.append((full, rel_filepath))
return files_to_parse
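get_files_to_parse now returns (full_path, relative_path) tuples: the full path is what parse_file opens, while the repo-relative path is what ends up in the csv as file_path. Illustratively:

import os
full = os.path.join("./rs", "src", "main.rs")  # opened by parse_file
rel = os.path.relpath(full, "./rs")            # "src/main.rs", stored in the csv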
def generate_summary(prompt):
- enc = tiktoken.encoding_for_model("text-davinci-003")
- if (len(enc.encode(prompt)) > 2500):
- return "too long to summarize."
-
- prompt = prompt + '\nSummarize the above code: '
- response = openai.Completion.create(
- model="text-davinci-003",
- prompt=prompt,
- temperature=0.7,
- max_tokens=1024,
- top_p=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- stop=["\"\"\""]
- )
- return response["choices"][0]["text"]
-
-
-# nate function to create blob. the blob just contains the file path and the source code.
+ enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
+ if (len(enc.encode(prompt)) > 3000):
+ return "too long to summarize."
+
+ prompt = prompt + '\nSummarize the above code: '
+
+ # response = openai.ChatCompletion.create(
+ # model="gpt-3.5-turbo",
+ # messages=[{"role": "user", "content": prompt}],
+ # temperature=0.7,
+ # max_tokens=1024,
+ # top_p=1.0,
+ # frequency_penalty=0.0,
+ # presence_penalty=0.0,
+ # stop=["\"\"\""]
+ # )
+
+ #return response["choices"][0]["message"]["content"]
+ return 'herro. this is a test summary'
+
+# create blob. the blob just contains the file path and the source code.
def blobify(pandaSeries):
return f"file path: {pandaSeries['file_path']}\n {pandaSeries['source']}"
-
-### doing stuff!!
-ts_helper = TS_Setup_Helper('./ts-languages.so')
-
-code_df = pd.DataFrame()
-#files = get_files_to_parse("../../dirserver/src/dirserver/", ts_helper.ext_map.keys(), dirs_to_ignore=['tests', 'vendor', 'unix']):
-
-files = get_files_to_parse("./rs", ts_helper.ext_map.keys())
-if len(files) == 0:
- print("didn't find any files to parse", file=sys.stderr)
- exit(1)
-for file in files:
- code_df = pd.concat([code_df, ts_helper.parse_file(file)])
-
-code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1)
-print(type(code_df))
-print(code_df)
-
-code_df.to_csv('rust_with_blob.csv')
-
-print('startng to generate summary')
-code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x))
-print('done with generate summary')
-
-print('generating embeddings')
-embedding_model = "text-embedding-ada-002"
-code_df["embedding_summary"] = code_df.summary.apply([lambda x: get_embedding(x, engine=embedding_model)])
-print('done with generating embeddings')
-
-code_df.to_csv('test_with_summary_and_embeddings.csv')
+def estimate_cost(df, skip_summary: bool):
+ enc = tiktoken.encoding_for_model("text-embedding-ada-002")
+ print(f'found {len(df.blob)} fns')
+ token_count = 0
+ for s in df.blob:
+ token_count += len(enc.encode(s))
+ embed_cost = (token_count / 1000) * 0.0001 # Ada v2
+ print(f"it will cost ~${embed_cost:.6f} to generate embeddings")
+
+ if not skip_summary:
+ enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
+ token_count = 0
+ for s in df.blob:
+ token_count += len(enc.encode(s))
+ summary_cost = ((token_count / 1000) * 0.0015) + (len(df.blob) * (500 / 1000) * 0.002)
+ print(f"it will cost ~${summary_cost:.6f} to generate summaries (see --skip-summary)")
+ print(f"which is ~${embed_cost + summary_cost:.6f} total.")
+
+ if input("\nType yes to continue or anything else to quit: ") != "yes":
+ sys.exit(0)
+ return
+
+
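A worked example of the arithmetic above, at the hard-coded prices (Ada v2 at $0.0001/1K tokens; gpt-3.5-turbo at $0.0015/1K input and $0.002/1K output), for a repo with 200 functions totalling 100,000 tokens:

# embeddings:     (100_000 / 1000) * 0.0001    -> $0.01
# summary input:  (100_000 / 1000) * 0.0015    -> $0.15
# summary output: 200 * (500 / 1000) * 0.002   -> $0.20 (assumes ~500 output tokens each)
# total:                                       -> ~$0.36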
+def setup(
+ filepath: Annotated[str, typer.Argument(help="path to repo")],
+ output_csv_filepath: Annotated[str, typer.Argument(help="filepath for csv output")],
+ ignorefile: Annotated[str, typer.Option(help="Path to text file containing dirnames to ignore. One name per line.")] = None,
+ skip_summary: Annotated[bool, typer.Option(help="Do not produce summaries for each function (to save cost).")] = False
+ ):
+
+ dirs_to_ignore = []
+ if ignorefile is not None:
+ #https://stackoverflow.com/questions/3925614/how-do-you-read-a-file-into-a-list-in-python
+ try:
+ with open(ignorefile) as file:
+ for line in file:
+ line = line.strip()
+ dirs_to_ignore.append(line)
+ except OSError:
+ print(f"IO error while processing {ignorefile}", file=sys.stderr)
+
+ ts_helper = TS_Setup_Helper('./ts-languages.so')
+ code_df = pd.DataFrame()
+
+ files = get_files_to_parse(filepath, list(ts_helper.ext_map.keys()), dirs_to_ignore)
+
+ if len(files) == 0:
+ print("didn't find any files to parse", file=sys.stderr)
+ sys.exit(1)
+ for file in files:
+ #print(file)
+ code_df = pd.concat([code_df, ts_helper.parse_file(file)])
+
+ code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1)
+
+ code_df.to_csv('rust_with_blob.csv')
+
+ estimate_cost(code_df, skip_summary)
+
+ if not skip_summary:
+ print('generating summary')
+ code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x))
+ print('done with summaries')
+ else:
+ code_df["summary"] = "no summary. --skip-summary"
+
+ print('generating embeddings')
+ embedding_model = "text-embedding-ada-002"
+ #code_df["embedding_summary"] = code_df.summary.apply(
+ # [lambda x: get_embedding(x, engine=embedding_model)]
+ # )
+ print('done with embeddings')
+ code_df.to_csv(output_csv_filepath)
+
+ sys.exit(0)
+
+if __name__ == "__main__":
+ typer.run(setup)
+ #setup('YOUR_PATH_HERE', ['ignore', 'dirs', 'here'])
+ #setup("../../openpilot/", "./ope.csv",
+ # ['tests', 'vendor', 'unix', 'test', 'debug', 'ui', 'third_party', 'tools', 'system']
+ # )