From 9d34c71ac0aa68d373a57505c4a9d763476cac7a Mon Sep 17 00:00:00 2001 From: Nate Buttke Date: Thu, 27 Jul 2023 17:48:51 -0700 Subject: Move tree-sitter fns into new helper class. Need to make language choice more modular. --- setup.py | 140 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 74 insertions(+), 66 deletions(-) diff --git a/setup.py b/setup.py index 3003c83..0b1e1cb 100644 --- a/setup.py +++ b/setup.py @@ -11,71 +11,76 @@ SOURCE_DIR = './' openai.api_key = os.getenv('END_OF_WORLD') -def ts_query(lang, tree, sexp): - query = lang.query(sexp) - return query.captures(tree.root_node) - -def ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code): - """Use treesitter to get all code blocks""" - - # TODO need way to switch between declaration and definition .. - # e.g. golang does not have function definitions according to treesitter - results = ts_query(lang, tree, """(function_declaration) @function""") - results += ts_query(lang, tree, """(method_declaration) @method""") - - # TODO something like list comprehension here? 
class TS_Setup_Helper:
    """Bundle a tree-sitter parser with its language and extract code blocks.

    The constructor loads a compiled tree-sitter language from a shared
    object and binds it to a fresh parser. `parse_file` then turns one source
    file into a pandas DataFrame with one row per captured code block.

    Attributes:
        parser: tree-sitter `Parser` configured with `lang`.
        lang: tree-sitter `Language` loaded from the shared object.
    """

    # Quoted so the annotations are not evaluated when the class body runs.
    parser: "Parser"
    lang: "Language"

    def __init__(self, ts_object_path, lang_name):
        """Create the parser and load the language.

        Args:
            ts_object_path: Path to the compiled tree-sitter library
                (the caller in this file passes './tree-go.so').
            lang_name: Name of the language inside that library
                (the caller in this file passes 'go').
        """
        self.parser = Parser()
        self.lang = Language(ts_object_path, lang_name)
        self.parser.set_language(self.lang)

    def ts_query(self, lang, tree, sexp):
        """Run the s-expression query `sexp` against the root node of `tree`.

        Args:
            lang: Language object used to compile the query; methods of this
                class pass `self.lang`.
            tree: Parsed tree whose `root_node` is searched.
            sexp: Query source text.

        Returns:
            Whatever `query.captures(...)` returns; the code below indexes
            each item as `(node, capture_name)`.
        """
        query = lang.query(sexp)
        return query.captures(tree.root_node)

    def ts_get_all_code_blocks(self, code_blocks, file_path, tree, code):
        """Use treesitter to get all code blocks.

        Appends one dict per captured node to `code_blocks` (mutated in
        place; nothing is returned).

        Args:
            code_blocks: List that receives the block dicts.
            file_path: Stored verbatim under the 'file_path' key.
            tree: Parsed tree for the file.
            code: The file contents as bytes; sliced with the node byte
                offsets and decoded as UTF-8 to produce 'source'.
        """
        # TODO need way to switch between declaration and definition ..
        # e.g. golang does not have function definitions according to treesitter
        results = self.ts_query(self.lang, tree, """(function_declaration) @function""")
        results += self.ts_query(self.lang, tree, """(method_declaration) @method""")

        code_blocks.extend(
            {
                'code_type': capture_name,
                'source': code[node.start_byte:node.end_byte].decode('utf-8'),
                'start_line': node.start_point[0],
                'end_line': node.end_point[0],
                'chars': node.end_byte - node.start_byte,
                'file_path': file_path,
            }
            for node, capture_name in results
        )

    def parse_file(self, file_path):
        """Take a source code file and return a pd DataFrame of its blocks.

        Args:
            file_path: Path of the file to read and parse.

        Returns:
            DataFrame built from the block dicts (keys: code_type, source,
            start_line, end_line, chars, file_path) after collation.
        """
        # read file
        with open(file_path, 'r') as f:
            code = f.read()

        # Encode once: the parser and the byte-offset slicing must see the
        # exact same buffer.
        source_bytes = bytes(code, "utf8")

        # Tree-Sitter
        tree = self.parser.parse(source_bytes)

        code_blocks = []
        self.ts_get_all_code_blocks(code_blocks, file_path, tree, source_bytes)

        return pd.DataFrame(self._collate_blocks(code_blocks))

    @staticmethod
    def _collate_blocks(code_blocks, collate_types=('import', 'assign')):
        """Merge runs of consecutive blocks that share a collatable code_type.

        Blocks whose 'code_type' is not in `collate_types` pass through
        unchanged (same dict object). A run of same-typed collatable blocks
        becomes one new dict: sources joined with a newline, line range
        widened to cover every member, 'chars' summed plus one per joining
        newline. Input dicts are never modified.

        Args:
            code_blocks: Iterable of block dicts in file order.
            collate_types: code_type values eligible for merging.

        Returns:
            A new list of block dicts.
        """
        merged = []
        pending = None  # collated block currently being accumulated

        for block in code_blocks:
            if block['code_type'] not in collate_types:
                if pending is not None:
                    merged.append(pending)
                    pending = None
                merged.append(block)
            elif pending is None:
                pending = dict(block)
            elif pending['code_type'] == block['code_type']:
                pending['source'] += f"\n{block['source']}"
                pending['start_line'] = min(pending['start_line'], block['start_line'])
                # Compare against the accumulated end_line (not start_line)
                # so the merged range can only grow.
                pending['end_line'] = max(pending['end_line'], block['end_line'])
                pending['chars'] += block['chars'] + 1
            else:
                # A different collatable type starts a new run.
                merged.append(pending)
                pending = dict(block)

        # Flush a run that reaches the end of the input; without this the
        # final collated block would be dropped.
        if pending is not None:
            merged.append(pending)
        return merged
dirs_to_ignore=['tests', 'vendor', 'unix']) -> list: @@ -119,15 +124,18 @@ def blobify(pandaSeries): ### doing stuff!! +ts_helper = TS_Setup_Helper('./tree-go.so', 'go') + code_df = pd.DataFrame() for file in get_files_to_parse("../../dirserver/src/dirserver/"): - code_df = pd.concat([code_df, parse_file(file)]) + code_df = pd.concat([code_df, ts_helper.parse_file(file)]) code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1) print(type(code_df)) print(code_df) -code_df.to_csv('test_with_blob.csv') +code_df.to_csv('1test_with_blob.csv') +exit() print('startng to generate summary') code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x)) -- cgit v1.2.3