import ast
import os
from collections import defaultdict

import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity
from tree_sitter import Language, Parser

# Default root directory for source discovery.
SOURCE_DIR = './'
# The OpenAI API key is read from the END_OF_WORLD environment variable.
openai.api_key = os.getenv('END_OF_WORLD')


# Earlier ast-based block extractor, kept for reference only.
#def get_block(code, node, code_type, file_path):
#    """combine a bunch of data about a function. return dictionary"""
#    blob = f"{node['pretext']}{ast.get_source_segment(code, node['node'])}"
#    return {
#        'code_type': code_type,
#        'source': blob,
#        'start_line': node['node'].lineno,
#        'end_line': node['node'].end_lineno,
#        'chars': len(blob),
#        'file_path': file_path
#    }


def ts_query(lang, tree, sexp):
    """Run a tree-sitter query over a whole tree.

    Args:
        lang: tree-sitter ``Language`` used to compile the query.
        tree: parsed tree; the query is applied to ``tree.root_node``.
        sexp: query source as an S-expression string.

    Returns:
        Whatever ``query.captures`` returns; callers in this file index each
        item as ``(node, capture_name)``.
    """
    query = lang.query(sexp)
    return query.captures(tree.root_node)


def ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code):
    """Use treesitter to collect every function and method declaration.

    One dict per capture is appended to ``code_blocks`` (mutated in place),
    functions first, then methods.

    Args:
        lang: tree-sitter ``Language`` for the parsed file.
        code_blocks: list that receives the block dicts.
        file_path: path recorded in each block dict.
        tree: parsed tree-sitter tree for ``code``.
        code: the source as ``bytes``; it is sliced by byte offset.
    """
    # TODO need way to switch between declaration and definition ..
    # e.g. golang does not have function definitions according to treesitter
    results = ts_query(lang, tree, """(function_declaration) @function""")
    results += ts_query(lang, tree, """(method_declaration) @method""")

    code_blocks.extend(
        {
            'code_type': capture_name,
            'source': code[node.start_byte:node.end_byte].decode('utf-8'),
            # Row indices exactly as reported by tree-sitter.
            'start_line': node.start_point[0],
            'end_line': node.end_point[0],
            # NOTE: this is a byte length, not a character count.
            'chars': node.end_byte - node.start_byte,
            'file_path': file_path
        }
        for node, capture_name in results
    )


def ts_get_all_code_blocks_old(code_blocks, file_path, node):
    """Use treesitter to record every descendant of ``node``.

    Appears to be the predecessor of ``ts_get_all_code_blocks``. Each child
    is appended to ``code_blocks`` and then visited recursively. Unlike the
    newer function, the dicts carry no ``'source'`` key.

    Args:
        code_blocks: list that receives the block dicts (mutated in place).
        file_path: path recorded in each block dict.
        node: tree-sitter node whose children are visited.
    """
    # dict has 'code_type' 'start_line' 'end_line' 'chars' 'file_path'
    for child in node.children:
        code_blocks.append({
            'code_type': child.type,
            'start_line': child.start_point[0],
            'end_line': child.end_point[0],
            'chars': child.end_byte - child.start_byte,
            'file_path': file_path
        })
        #if child.type != "function_definition" and len(child.children)
        # Recurse into this function: the previous call targeted
        # ts_get_all_code_blocks, whose signature takes five arguments.
        ts_get_all_code_blocks_old(code_blocks, file_path, child)


def _collate_blocks(code_blocks, collate_types):
    """Merge consecutive blocks of the same collatable type into one block."""
    merged = []
    pending = None
    for block in code_blocks:
        if block['code_type'] not in collate_types:
            # A non-collatable block ends any run in progress.
            if pending is not None:
                merged.append(pending)
                pending = None
            merged.append(block)
            continue

        if pending is None:
            pending = dict(block)
        elif pending['code_type'] == block['code_type']:
            pending['source'] += f"\n{block['source']}"
            pending['start_line'] = min(pending['start_line'], block['start_line'])
            pending['end_line'] = max(pending['end_line'], block['end_line'])
            # +1 accounts for the newline used to join the sources.
            pending['chars'] += (block['chars'] + 1)
        else:
            merged.append(pending)
            pending = dict(block)

    # Flush a run that reaches the end of the input.
    if pending is not None:
        merged.append(pending)
    return merged


def parse_file(file_path, grammar_path="./tree-go.so", language_name="go"):
    """take source code file and return pd dataframe

    Args:
        file_path: path of the source file to parse.
        grammar_path: compiled tree-sitter grammar library to load.
        language_name: language name inside that grammar library.

    Returns:
        A ``pd.DataFrame`` with one row per code block and the columns
        ``code_type``, ``source``, ``start_line``, ``end_line``, ``chars``
        and ``file_path``. It is empty when no block is found.
    """
    # Read as UTF-8 explicitly; tree-sitter is handed the same encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        code_bytes = f.read().encode('utf8')

    # Tree-Sitter
    parser = Parser()
    lang = Language(grammar_path, language_name)
    parser.set_language(lang)
    tree = parser.parse(code_bytes)

    code_blocks = []
    ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code_bytes)

    #TODO
    # collate imports, assign
    collate_types = ['import', 'assign']
    return pd.DataFrame(_collate_blocks(code_blocks, collate_types))


def get_files_to_parse(root_path, files_extensions_to_parse=('py',), dirs_to_ignore=('tests',)) -> list:
    """get all source file paths as list.

    Args:
        root_path: directory to walk recursively.
        files_extensions_to_parse: extensions (without the dot) to keep.
        dirs_to_ignore: directory names whose contents are skipped.

    Returns:
        Paths of the matching files, each joined onto its walk root.
    """
    files_to_parse = []
    for root, dirs, files in os.walk(root_path):
        # Prune in place so os.walk never descends into ignored directories.
        dirs[:] = [d for d in dirs if d not in dirs_to_ignore]
        # Pruning cannot cover the starting directory itself.
        if os.path.basename(os.path.normpath(root)) in dirs_to_ignore:
            continue
        for name in files:
            extension = os.path.splitext(name)[1][1:]
            if extension in files_extensions_to_parse:
                files_to_parse.append(os.path.join(root, name))
    return files_to_parse


def generate_summary(prompt):
    """Ask the OpenAI completion endpoint to summarize a piece of code.

    Args:
        prompt: text to summarize; a summarize instruction is appended.

    Returns:
        The text of the first completion choice. Generation stops at a
        triple double-quote sequence.
    """
    prompt = prompt + '\nSummarize the above code: '
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.7,
        max_tokens=1024,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        stop=["\"\"\""]
    )
    return response["choices"][0]["text"]


# nate function to create blob.
# the blob just contains the file path and the source code.
def blobify(pandaSeries):
    """Build the text blob for one code block.

    Args:
        pandaSeries: a row (or any mapping) with ``'file_path'`` and
            ``'source'`` entries.

    Returns:
        The file path line followed by the source text.
    """
    return f"file path: {pandaSeries['file_path']}\n {pandaSeries['source']}"


def main():
    """Parse one Go file, build blobs, summarize them and write CSV snapshots.

    A CSV is written after each stage so partial results survive a failure
    in a later stage.
    """
    ### doing stuff!!
    df = parse_file("../../dirserver/src/dirserver/fdpoller.go")
    df.to_csv('test.csv')

    df["blob"] = df.apply(blobify, axis=1)
    print(type(df))
    print(df)
    df.to_csv('test_with_blob.csv')

    print('startng to generate summary')
    df["summary"] = df.blob.apply(generate_summary)
    print('done with generate summary')
    df.to_csv('test_with_summary.csv')


# Run the pipeline only as a script, so importing the module has no side
# effects such as file writes or API calls.
if __name__ == "__main__":
    main()