From ad9ebbe7c78c2cf7c717d7898534371d59f325d9 Mon Sep 17 00:00:00 2001 From: Nate Buttke Date: Wed, 26 Jul 2023 23:40:32 -0700 Subject: today's work. fixed multi-file parsing. don't send too-large files up to API. Generate embeddings. --- setup.py | 71 +++++++++++++++++++++++------------------------------------ setup_cont.py | 16 -------------- 2 files changed, 27 insertions(+), 60 deletions(-) delete mode 100644 setup_cont.py diff --git a/setup.py b/setup.py index 80be931..3003c83 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -import ast from collections import defaultdict import os import pandas as pd @@ -12,18 +11,6 @@ SOURCE_DIR = './' openai.api_key = os.getenv('END_OF_WORLD') -#def get_block(code, node, code_type, file_path): -# """combine a bunch of data about a function. return dictionary""" -# blob = f"{node['pretext']}{ast.get_source_segment(code, node['node'])}" -# return { -# 'code_type': code_type, -# 'source': blob, -# 'start_line': node['node'].lineno, -# 'end_line': node['node'].end_lineno, -# 'chars': len(blob), -# 'file_path': file_path -# } - def ts_query(lang, tree, sexp): query = lang.query(sexp) return query.captures(tree.root_node) @@ -36,7 +23,7 @@ def ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code): results = ts_query(lang, tree, """(function_declaration) @function""") results += ts_query(lang, tree, """(method_declaration) @method""") - # TODO something like list comprehension here + # TODO something like list comprehension here? for r in results: return_dict = { 'code_type': r[1], @@ -48,24 +35,6 @@ def ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code): } code_blocks.append(return_dict) - -def ts_get_all_code_blocks_old(code_blocks, file_path, node): - """Use treesitter to get all code blocks""" - #dict has'code_type' 'source' 'start_line' 'end_line' 'chars' 'file_path' - #print('HERRO', type(node)) - for child in node.children: - #print(type(child), child) - return_dict = { - 'code_type': child.type, - 'start_line': child.start_point[0], - 'end_line': child.end_point[0], - 'chars': child.end_byte - child.start_byte, - 'file_path': file_path - } - code_blocks.append(return_dict) - #if child.type != "function_definition" and len(child.children) - ts_get_all_code_blocks(code_blocks, file_path, child) - def parse_file(file_path): """take source code file and return pd dataframe""" # read file @@ -109,18 +78,27 @@ def parse_file(file_path): return df -def get_files_to_parse(root_path, files_extensions_to_parse=['py'], dirs_to_ignore=['tests']) -> list: +def get_files_to_parse(root_path, files_extensions_to_parse=['go'], dirs_to_ignore=['tests', 'vendor', 'unix']) -> list: """get all source file paths as list.""" files_to_parse = [] - for root, dirs, files in os.walk(SOURCE_DIR): + for root, dirs, files in os.walk(root_path): + # there is probably a better way to do this + # https://stackoverflow.com/questions/13454164/os-walk-without-hidden-folders + files = [f for f in files if not f[0] == '.'] + dirs[:] = [d for d in dirs if (not d[0] == '.') and (set(d.split()).isdisjoint(dirs_to_ignore))] for name in files: - if (root.rsplit("/", 1)[-1] in dirs_to_ignore) or (name.rsplit('.')[-1] not in files_extensions_to_parse): + #if (dirfix(root).rsplit("/", 1)[-1] in dirs_to_ignore) or (name in dirs_to_ignore) or (name.rsplit('.')[-1] not in files_extensions_to_parse): + if (name.rsplit('.')[-1] not in files_extensions_to_parse): continue temp_path = os.path.join(root, name) files_to_parse.append(temp_path) return files_to_parse def generate_summary(prompt): + enc = tiktoken.encoding_for_model("text-davinci-003") + if (len(enc.encode(prompt)) > 2500): + return "too long to summarize." + prompt = prompt + '\nSummarize the above code: ' response = openai.Completion.create( model="text-davinci-003", @@ -134,25 +112,30 @@ def generate_summary(prompt): ) return response["choices"][0]["text"] + # nate function to create blob. the blob just contains the file path and the source code. def blobify(pandaSeries): return f"file path: {pandaSeries['file_path']}\n {pandaSeries['source']}" ### doing stuff!! +code_df = pd.DataFrame() +for file in get_files_to_parse("../../dirserver/src/dirserver/"): + code_df = pd.concat([code_df, parse_file(file)]) -df = parse_file("../../dirserver/src/dirserver/fdpoller.go") -df.to_csv('test.csv') -df["blob"] = df.apply(lambda x: blobify(x),axis=1) - -print(type(df)) -print(df) +code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1) +print(type(code_df)) +print(code_df) -df.to_csv('test_with_blob.csv') +code_df.to_csv('test_with_blob.csv') print('startng to generate summary') -df["summary"] = df.blob.apply(lambda x: generate_summary(x)) +code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x)) print('done with generate summary') -df.to_csv('test_with_summary.csv') +print('generating embeddings') +embedding_model = "text-embedding-ada-002" +code_df["embedding_summary"] = code_df.summary.apply([lambda x: get_embedding(x, engine=embedding_model)]) +print('done with generating embeddings') +code_df.to_csv('test_with_summary_and_embeddings.csv') diff --git a/setup_cont.py b/setup_cont.py deleted file mode 100644 index 360c9f9..0000000 --- a/setup_cont.py +++ /dev/null @@ -1,16 +0,0 @@ -import ast -from collections import defaultdict -import os -import pandas as pd -import openai -import tiktoken -from openai.embeddings_utils import get_embedding, cosine_similarity - -openai.api_key = os.getenv('END_OF_WORLD') - -df=pd.read_csv("setup_dataWithSummary.csv") -embedding_model = "text-embedding-ada-002" -df["embedding_summary"] = df.summary.apply([lambda x: get_embedding(x, engine=embedding_model)]) -print(df) - -df.to_csv('setup_dataWithSummaryEmbed.csv') -- cgit v1.2.3