From 7435e423776c7b35b9c6c9bebba25a44691554bf Mon Sep 17 00:00:00 2001 From: Nate Buttke Date: Mon, 14 Aug 2023 20:35:45 -0700 Subject: huge fix to path handling. added clean cli, cost estimate, ignorefile. --- requirements.txt | 14 ++-- server.py | 208 +++++++++++++++++++++++++++++++++---------------------- setup.py | 200 +++++++++++++++++++++++++++++++++------------------- 3 files changed, 263 insertions(+), 159 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1beac1b..b60ffb4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ -Flask==2.3.2 -numpy==1.25.0 -openai==0.27.8 -pandas==2.0.3 -tiktoken==0.4.0 -tree_sitter==0.20.1 +Flask +numpy +openai +pandas +tiktoken +tree_sitter +typer +typing_extensions diff --git a/server.py b/server.py index 2bb203d..1cd914c 100644 --- a/server.py +++ b/server.py @@ -1,49 +1,56 @@ -from flask import Flask, request, Response, jsonify, render_template -import pandas as pd from collections import defaultdict import os -import json +import sys +from pathlib import PurePosixPath +import base64 +from flask import Flask, request, jsonify, render_template +import pandas as pd import openai from openai.embeddings_utils import get_embedding, cosine_similarity openai.api_key = os.getenv('OPENAI_KEY') -import numpy as np -from pathlib import PurePosixPath -FILETYPES = ['.sh', '.c', '.h', '.cpp', '.cxx', '.hxx', '.hpp', '.go', '.hs', '.js', '.py', '.rs'] +if len(sys.argv) != 2: + print("USAGE: python server.py PATH_TO_CSV") + print("wrong number of arguments", file=sys.stderr) + sys.exit(1) +try: + df=pd.read_csv(sys.argv[1], converters={"embedding_summary": pd.eval}) +except: + print(f"Problem opening {sys.argv[1]}", file=sys.stderr) + sys.exit(1) + app = Flask(__name__, template_folder="./frontend", static_folder="./frontend", static_url_path="") def search_code(df, query, n=4): query_embedding = get_embedding( - query, - engine="text-embedding-ada-002" - ) + query, + engine="text-embedding-ada-002" + ) df["similarity"] = df.embedding_summary.apply(lambda x: cosine_similarity(x, query_embedding)) results = ( - df.sort_values("similarity", ascending=False) - ) + df.sort_values("similarity", ascending=False) + ) return results.head(n) - def generate_answer(question): - results = search_code(df, question, n=4) - prompt = '' - for i in range(3): - prompt += results.iloc[i]["summary"] + "\n" + results.iloc[i]["blob"] + "\n" - prompt += "\n" + "Answer the following question using the code context given above, and show an example with 'Example'\nQ: " + question + "\nA: " - response = openai.Completion.create( - model="text-davinci-003", - # model="code-davinci-002", - prompt=prompt, - temperature=0.7, - max_tokens=1000, - top_p=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - stop=["\"\"\""] - ) - return response["choices"][0]["text"] - + results = search_code(df, question, n=4) + prompt = '' + for i in range(3): + prompt += results.iloc[i]["summary"] + "\n" + results.iloc[i]["blob"] + "\n" + prompt += "\n" + "Answer the following question using the code context given above, and show an example with 'Example'\nQ: " + question + "\nA: " + response = openai.Completion.create( + model="text-davinci-003", + # model="code-davinci-002", + prompt=prompt, + temperature=0.7, + max_tokens=1000, + top_p=1.0, + frequency_penalty=0.0, + presence_penalty=0.0, + stop=["\"\"\""] + ) + return response["choices"][0]["text"] def add_to_tree(tree: dict, path: str): parts = PurePosixPath(path).parts @@ -65,72 +72,111 @@ def create_directory_tree(df): add_to_tree(directory_tree, path) return directory_tree -# Nate: these are from the original project. My own csv is below -#df = pd.read_csv("./frontend/data/embedded_summarized.csv") -#df = pd.read_csv("./frontend/data/withsummary2.csv") +def get_outermost_item(dirstructure): + return list(dirstructure.keys())[0] -# My line -df=pd.read_csv("./frontend/data/test_with_summary_and_embeddings.csv", converters={'embedding_summary': pd.eval}) +def get_kids_of_root(dirstructure): + return list(dirstructure.values())[0].get("children").keys() -# need to do funny stuff to read in the data frame correctly from csv. that's -# why the eval() is below. and pd.eval is above. -# df['embeddings'] = df['embeddings'].apply(lambda x: eval(x)) def check_path(path, dirstructure): - children = None - components = PurePosixPath(path).parts - if(components[0] in dirstructure.keys()): - print('SCREAM') - - currentdict = dirstructure.get(components[0]) #outermost is solitary by setup.py - print(len(components)) - for component in components[1:]: - print('COMP', component) - if component in currentdict.get("children", {}): - currentdict = currentdict["children"][component] - else: - return False, None, children - if currentdict["filetype"] == "dir": - children = currentdict["children"] - return True, currentdict["filetype"], children - else: + components = PurePosixPath(path).parts + if components[0] in dirstructure.keys(): + if dirstructure[components[0]]["filetype"] == "dir": + subdict = dirstructure.get(components[0]) + if len(components) == 1: + ftype = subdict["filetype"] + kids = list(subdict["children"].keys()) + return True, str(ftype), kids + else: + for c in components[1:]: + if c in subdict["children"]: + found = True + subdict = subdict["children"].get(c) + ftype = subdict["filetype"] + kids = list(subdict["children"].keys()) if ftype == 'dir' else None + else: + found = False + if found: + return found, str(ftype), kids + else: + return True, "file", None return False, None, None + +# need to do this to read in the data frame correctly from csv. (pd.eval) +#df=pd.read_csv("./frontend/data/test_with_summary_and_embeddings.csv", converters={'embedding_summary': pd.eval}) +#df=pd.read_csv("./frontend/data/r2.csv", converters={'embedding_summary': pd.eval}) + +# old strategy: +# df['embeddings'] = df['embeddings'].apply(lambda x: eval(x)) + @app.route('/') def home(): - req_path = request.args.get('path') - dirstructure = create_directory_tree(df) - if req_path is None: - loctype = "folder" - text = [[x, ""] for x in list(df['file_path'].unique())] - else: - text = [[x, y] for x, y in zip( - list(df[df['file_path'] == req_path]['source']), - list(df[df['file_path'] == req_path]['summary']) - )] - loctype = "file" - - res = { - 'loctype': loctype, - 'text': text, - 'current': 'root directory' if (req_path is None) else req_path - } - return render_template('index.html', payload=res) + req_path = request.args.get('path') + path_decode = None if req_path is None else base64.urlsafe_b64decode(req_path).decode('utf-8') + dirstructure = create_directory_tree(df) + if req_path is None: + rootname = get_outermost_item(dirstructure) + if check_path(rootname, dirstructure)[1 == "file"]: + if len(dirstructure.keys()) == 1: + loctype = "file" + text = [[(x, None), y] for x, y in zip( + list(df[df["file_path"] == rootname]["source"]), + list(df[df["file_path"] == rootname]["summary"]) + )] + else: + loctype = "folder" + files = list(dirstructure.keys()) + text = [[(x, base64.urlsafe_b64encode(bytes(x, 'utf-8')).decode("utf-8")), ""] for x in files] + else: + loctype = "folder" + if not rootname.endswith("/"): + rootname += '/' + kids = get_kids_of_root(dirstructure) + text = [[(k, base64.urlsafe_b64encode(bytes(rootname + k, 'utf-8')).decode('utf-8')), ""] for k in kids] + else: + path_info = check_path(path_decode, dirstructure) + if path_info[0] is False: + text = [["error", "path not found"]] + loctype = "file" + elif path_info[1] == "file": + loctype = "file" + text = [[(x, None), y] for x, y in zip( + list(df[df["file_path"] == path_decode]["source"]), + list(df[df["file_path"] == path_decode]["summary"]) + )] + elif path_info[1] == "dir": + loctype = "folder" + text = [[(x, base64.urlsafe_b64encode(bytes(path_decode + "/" + x, 'utf-8')).decode("utf-8")), ""] for x in path_info[2]] + + if req_path is not None: + curr = path_decode + elif loctype == "folder": + curr = 'root directory' + else: + curr = get_outermost_item(dirstructure) + + res = { + 'loctype': loctype, + 'text': text, + 'current': curr + } + return render_template('index.html', payload=res) @app.route('/answer') def answer(): - q = request.args.get('q', '').strip() - a = search_code(df, q) - res = [{'blob': x['blob'], 'summary': x['summary']} for x in a.to_dict('records')] - - return jsonify(res) + q = request.args.get('q', '').strip() + a = search_code(df, q) + res = [{'blob': x['blob'], 'summary': x['summary']} for x in a.to_dict('records')] + return jsonify(res) @app.route('/explain') def explain(): - q = request.args.get('q', '').strip() - a = generate_answer(q) - return jsonify(a) + q = request.args.get('q', '').strip() + a = generate_answer(q) + return jsonify(a) if __name__ == '__main__': diff --git a/setup.py b/setup.py index 8ff5027..9efb57c 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,19 @@ -from collections import defaultdict import os import sys import pandas as pd -import openai +import openai import tiktoken -from openai.embeddings_utils import get_embedding, cosine_similarity - +from openai.embeddings_utils import get_embedding from tree_sitter import Language, Parser - -SOURCE_DIR = './' +from typing_extensions import Annotated +import typer openai.api_key = os.getenv('END_OF_WORLD') class TS_Setup_Helper: + """ + Tree sitter functions and data for the setup process + """ parser: Parser ts_obj_path: str ext_map: dict @@ -50,7 +51,8 @@ class TS_Setup_Helper: self.CPP_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""], self.GO_LANGUAGE: ["""(function_declaration) @function""", """(method_declaration) @method"""], self.JS_LANGUAGE: ["""[(function) (function_declaration)] @function"""], - self.PY_LANGUAGE: ["""(function_definition) @function""", """[(import_statement) (import_from_statement)] @import"""], + self.PY_LANGUAGE: ["""(function_definition) @function""", + """[(import_statement) (import_from_statement)] @import"""], self.RS_LANGUAGE: ["""(function_item) @function""", """(use_declaration) @import"""] } @@ -61,11 +63,8 @@ class TS_Setup_Helper: def ts_get_all_code_blocks(self, code_blocks, file_path, lang, tree, code): """Use treesitter to get all code blocks""" - # TODO need way to switch between declaration and definition .. - # e.g. golang does not have function definitions according to treesitter results = [ ] for query in self.qmap.get(lang): - print(query) results += self.ts_query(lang, tree, query) # TODO something like list comprehension here? @@ -81,22 +80,21 @@ class TS_Setup_Helper: code_blocks.append(return_dict) def parse_file(self, file_path): - print('parse') """take source code file and return pd dataframe""" # read file - with open(file_path, 'r') as f: + with open(file_path[0], 'r') as f: code = f.read() # Tree-Sitter - extension = os.path.splitext(file_path)[1].lstrip(".") + extension = os.path.splitext(file_path[0])[1].lstrip(".") lang = self.ext_map.get(extension) if lang is None: - raise NotImplementedError(f"The file extension .{extension} is not implemented") + raise NotImplementedError(f"The file extension .{extension} is not implemented ({file_path[0]})") self.parser.set_language(lang) tree = self.parser.parse(bytes(code, "utf8")) code_blocks = [] - self.ts_get_all_code_blocks(code_blocks, file_path, lang, tree, bytes(code, "utf8")) + self.ts_get_all_code_blocks(code_blocks, file_path[1], lang, tree, bytes(code, "utf8")) collate_types = ['import', 'assign'] tempblock = None @@ -123,72 +121,130 @@ class TS_Setup_Helper: return df -def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore=['tests', 'vendor', 'unix']) -> list: +def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore) -> list: """get all source file paths as list.""" files_to_parse = [] for root, dirs, files in os.walk(root_path): - # there is probably a better way to do this + # there may be a better way to do this # https://stackoverflow.com/questions/13454164/os-walk-without-hidden-folders - files = [f for f in files if not f[0] == '.'] + files = [ + f for f in files if (not f[0] == '.') + and (os.path.splitext(f)[-1].lstrip(".") in files_extensions_to_parse) + ] dirs[:] = [d for d in dirs if (not d[0] == '.') and (set(d.split()).isdisjoint(dirs_to_ignore))] for name in files: - #if (dirfix(root).rsplit("/", 1)[-1] in dirs_to_ignore) or (name in dirs_to_ignore) or (name.rsplit('.')[-1] not in files_extensions_to_parse): - if (name.rsplit('.')[-1] not in files_extensions_to_parse): - continue - temp_path = os.path.join(root, name) - files_to_parse.append(temp_path) + full = os.path.join(root, name) + rel_dir = os.path.relpath(root, root_path) + rel_filepath = os.path.join(rel_dir, name) + if rel_filepath.startswith("./"): + rel_filepath = rel_filepath[len("./"):] + files_to_parse.append((full, rel_filepath)) return files_to_parse def generate_summary(prompt): - enc = tiktoken.encoding_for_model("text-davinci-003") - if (len(enc.encode(prompt)) > 2500): - return "too long to summarize." - - prompt = prompt + '\nSummarize the above code: ' - response = openai.Completion.create( - model="text-davinci-003", - prompt=prompt, - temperature=0.7, - max_tokens=1024, - top_p=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - stop=["\"\"\""] - ) - return response["choices"][0]["text"] - - -# nate function to create blob. the blob just contains the file path and the source code. + enc = tiktoken.encoding_for_model("gpt-3.5-turbo") + if (len(enc.encode(prompt)) > 3000): + return "too long to summarize." + + prompt = prompt + '\nSummarize the above code: ' + + # response = openai.ChatCompletion.create( + # model="gpt-3.5-turbo", + # messages=[{"role": "user", "content": prompt}], + # temperature=0.7, + # max_tokens=1024, + # top_p=1.0, + # frequency_penalty=0.0, + # presence_penalty=0.0, + # stop=["\"\"\""] + # ) + + #return response["choices"][0]["message"]["content"] + return 'herro. this is a test summary' + +# create blob. the blob just contains the file path and the source code. def blobify(pandaSeries): return f"file path: {pandaSeries['file_path']}\n {pandaSeries['source']}" - -### doing stuff!! -ts_helper = TS_Setup_Helper('./ts-languages.so') - -code_df = pd.DataFrame() -#files = get_files_to_parse("../../dirserver/src/dirserver/", ts_helper.ext_map.keys(), dirs_to_ignore=['tests', 'vendor', 'unix']): - -files = get_files_to_parse("./rs", ts_helper.ext_map.keys()) -if len(files) == 0: - print("didn't find any files to parse", file=sys.stderr) - exit(1) -for file in files: - code_df = pd.concat([code_df, ts_helper.parse_file(file)]) - -code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1) -print(type(code_df)) -print(code_df) - -code_df.to_csv('rust_with_blob.csv') - -print('startng to generate summary') -code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x)) -print('done with generate summary') - -print('generating embeddings') -embedding_model = "text-embedding-ada-002" -code_df["embedding_summary"] = code_df.summary.apply([lambda x: get_embedding(x, engine=embedding_model)]) -print('done with generating embeddings') - -code_df.to_csv('test_with_summary_and_embeddings.csv') +def estimate_cost(df, skip_summary: bool): + enc = tiktoken.encoding_for_model("text-embedding-ada-002") + print(f'found {len(df.blob)} fns') + token_count = 0 + for s in df.blob: + token_count += len(enc.encode(s)) + embed_cost = (token_count / 1000) * 0.0001 # Ada v2 + print(f"it will cost ~${embed_cost:.6f} to generate embeddings") + + if not skip_summary: + enc = tiktoken.encoding_for_model("gpt-3.5-turbo") + token_count = 0 + for s in df.blob: + token_count += len(enc.encode(s)) + summary_cost = ((token_count / 1000) * 0.0015) + ( len(df.blob) * (500/1000) * 0.002) + print(f"it will cost ~${summary_cost:.6f} to generate summaries (see --skip-summary)") + print(f"which is ~${embed_cost + summary_cost:.6f} total.") + + if input("\nType yes to continue or anything else to quit: ") != "yes": + sys.exit(0) + return + + +def setup( + filepath: Annotated[str, typer.Argument(help="path to repo")], + output_csv_filepath: Annotated[str, typer.Argument(help="filepath for csv output")], + ignorefile: Annotated[str, typer.Option(help="Path to text file containing dirnames to ignore. One name per line.")] = None, + skip_summary: Annotated[bool, typer.Option(help="Do not produce summaries for each function (to save cost).")] = False + ): + + dirs_to_ignore = [] + if ignorefile != None: + #https://stackoverflow.com/questions/3925614/how-do-you-read-a-file-into-a-list-in-python + try: + with open(ignorefile) as file: + for line in file: + line = line.strip() + dirs_to_ignore.append(line) + except: + print(f"IO error while procesing {ignorefile}", file=sys.stderr) + + ts_helper = TS_Setup_Helper('./ts-languages.so') + code_df = pd.DataFrame() + + files = get_files_to_parse( filepath, list(ts_helper.ext_map.keys()), dirs_to_ignore) + + if len(files) == 0: + print("didn't find any files to parse", file=sys.stderr) + sys.exit(1) + for file in files: + #print(file) + code_df = pd.concat([code_df, ts_helper.parse_file(file)]) + + code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1) + + code_df.to_csv('rust_with_blob.csv') + + estimate_cost(code_df, skip_summary) + + if not skip_summary: + print('generating summary') + code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x)) + print('done with summaries') + else: + code_df["summary"] = "no summary. --skip-summary" + + print('generating embeddings') + embedding_model = "text-embedding-ada-002" + #code_df["embedding_summary"] = code_df.summary.apply( + # [lambda x: get_embedding(x, engine=embedding_model)] + # ) + print('done with embeddings') + code_df.to_csv(output_csv_filepath) + + sys.exit(0) + +if __name__ == "__main__": + typer.run(setup) + #setup('YOUR_PATH_HERE', ['ignore', 'dirs', 'here']) + #setup("../../openpilot/", "./ope.csv", + # ['tests', 'vendor', 'unix', 'test', 'debug', 'ui', 'third_party', 'tools', 'system'] + # ) -- cgit v1.2.3