from collections import defaultdict
import os
import pandas as pd
import openai 
import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity

from tree_sitter import Language, Parser

# Root directory for sources. Nothing else in the visible part of this file
# reads this constant.
SOURCE_DIR = './'

# The OpenAI API key comes from the END_OF_WORLD environment variable; the
# lookup yields None when that variable is not set.
openai.api_key = os.environ.get('END_OF_WORLD')

class TS_Setup_Helper:
    """Bundle a tree-sitter parser with one language and extract code blocks.

    The helper loads a compiled grammar, parses source files with it and turns
    every function/method declaration it finds into a row of a pandas
    DataFrame.
    """

    parser: Parser
    lang: Language

    def __init__(self, ts_object_path, lang_name):
        """Load the grammar and bind a parser to it.

        Args:
            ts_object_path: Path handed to ``Language`` (the compiled grammar).
            lang_name: Language name handed to ``Language`` (e.g. ``'go'``).
        """
        self.parser = Parser()
        self.lang = Language(ts_object_path, lang_name)
        self.parser.set_language(self.lang)

    def ts_query(self, lang, tree, sexp):
        """Run the query ``sexp`` on ``tree`` and return its captures.

        Args:
            lang: Language object used to compile the query.
            tree: Parsed tree whose root node is searched.
            sexp: Query text in tree-sitter's S-expression syntax.
        """
        query = lang.query(sexp)
        return query.captures(tree.root_node)

    def ts_get_all_code_blocks(self, code_blocks, file_path, tree, code):
        """Use treesitter to get all code blocks.

        One dict per captured function/method declaration is appended to
        ``code_blocks`` (the list is modified in place; nothing is returned).

        Args:
            code_blocks: List that receives the block dicts.
            file_path: Stored verbatim in every block under ``'file_path'``.
            tree: Parsed tree for ``code``.
            code: The source as UTF-8 bytes; node byte offsets index into it.
        """
        # TODO need way to switch between declaration and definition ..
        # e.g. golang does not have function definitions according to treesitter
        captures = list(self.ts_query(self.lang, tree, """(function_declaration) @function"""))
        captures.extend(self.ts_query(self.lang, tree, """(method_declaration) @method"""))

        # Each capture is a (node, capture_name) pair.
        code_blocks.extend(
            {
                'code_type': capture_name,
                'source': code[node.start_byte:node.end_byte].decode('utf-8'),
                # Row component of the node's start/end point.
                'start_line': node.start_point[0],
                'end_line': node.end_point[0],
                # Despite the key name this is a length in bytes.
                'chars': node.end_byte - node.start_byte,
                'file_path': file_path,
            }
            for node, capture_name in captures
        )

    @staticmethod
    def _collate_blocks(code_blocks, collate_types=('import', 'assign')):
        """Merge consecutive blocks that share a collatable ``code_type``.

        Blocks whose type is not in ``collate_types`` are passed through
        untouched. A run of same-typed collatable blocks becomes a single
        block: sources joined with a newline, line span widened to cover the
        whole run, and ``chars`` summed (plus one per inserted newline).

        Returns:
            A new list of block dicts in the original order.
        """
        merged = []
        pending = None  # the collated run currently being built, if any

        for block in code_blocks:
            if block['code_type'] not in collate_types:
                if pending is not None:
                    merged.append(pending)
                    pending = None
                merged.append(block)
            elif pending is not None and pending['code_type'] == block['code_type']:
                pending['source'] += f"\n{block['source']}"
                pending['start_line'] = min(pending['start_line'], block['start_line'])
                # Compare against the accumulated end line, not the start line.
                pending['end_line'] = max(pending['end_line'], block['end_line'])
                pending['chars'] += block['chars'] + 1
            else:
                # A different collatable type (or the first one): close the
                # previous run and start a new one from a copy of this block.
                if pending is not None:
                    merged.append(pending)
                pending = dict(block)

        # Flush a run that reaches the end of the input; otherwise it is lost.
        if pending is not None:
            merged.append(pending)
        return merged

    def parse_file(self, file_path):
        """take source code file and return pd dataframe

        Args:
            file_path: Path of the source file to read and parse.

        Returns:
            A DataFrame with one row per code block (columns ``code_type``,
            ``source``, ``start_line``, ``end_line``, ``chars``,
            ``file_path``); empty when no block was captured.
        """
        # The bytes given to the parser are UTF-8, so read the text as UTF-8
        # too instead of relying on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()

        # Encode once; the parser and the byte-offset slicing share the buffer.
        code_bytes = code.encode('utf-8')
        tree = self.parser.parse(code_bytes)

        code_blocks = []
        self.ts_get_all_code_blocks(code_blocks, file_path, tree, code_bytes)

        # TODO: the queries above only capture 'function' and 'method', so the
        # import/assign collation has nothing to merge until those are added.
        return pd.DataFrame(self._collate_blocks(code_blocks))


def get_files_to_parse(root_path, files_extensions_to_parse=('go',), dirs_to_ignore=('tests', 'vendor', 'unix')) -> list:
    """get all source file paths as list.

    Walks ``root_path`` recursively. Hidden files and directories (names
    starting with '.') are skipped, as is every directory whose name is in
    ``dirs_to_ignore``.

    Args:
        root_path: Directory to start walking from.
        files_extensions_to_parse: Extensions to keep, written without the
            leading dot (e.g. ``'go'``).
        dirs_to_ignore: Directory names that are not descended into.

    Returns:
        List of matching paths, each built as ``os.path.join(root, name)``.
    """
    files_to_parse = []
    for root, dirs, files in os.walk(root_path):
        # Assigning to dirs[:] prunes the walk in place, so os.walk never
        # descends into hidden or ignored directories. Whole names are
        # compared, so "unit tests" is not confused with "tests".
        dirs[:] = [d for d in dirs if not d.startswith('.') and d not in dirs_to_ignore]
        for name in files:
            if name.startswith('.'):
                continue
            # splitext yields '' for names without a dot, so a file called
            # just "go" is not mistaken for a *.go file.
            extension = os.path.splitext(name)[1][1:]
            if extension not in files_extensions_to_parse:
                continue
            files_to_parse.append(os.path.join(root, name))
    return files_to_parse

def generate_summary(prompt, *, model="text-davinci-003", max_prompt_tokens=2500, max_tokens=1024):
    """Ask an OpenAI completion model to summarize ``prompt``.

    Args:
        prompt: Text (here: a code blob) to summarize.
        model: Completion model name. The same name selects the tokenizer
            used for the length check, so the two cannot drift apart.
        max_prompt_tokens: Prompts that encode to more tokens than this are
            not sent to the API.
        max_tokens: Upper bound on the number of tokens generated.

    Returns:
        The text of the first completion choice, or the fixed string
        "too long to summarize." when the prompt exceeds ``max_prompt_tokens``.
    """
    encoding = tiktoken.encoding_for_model(model)
    # The length check is made on the raw prompt, before the instruction
    # suffix below is appended.
    if len(encoding.encode(prompt)) > max_prompt_tokens:
        return "too long to summarize."

    response = openai.Completion.create(
        model=model,
        prompt=prompt + '\nSummarize the above code: ',
        temperature=0.7,
        max_tokens=max_tokens,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        # Generation stops if the model emits a triple double quote.
        stop=["\"\"\""],
    )
    return response["choices"][0]["text"]


# nate function to create blob. the blob just contains the file path and the source code.
def blobify(pandaSeries):
    """Return a text blob: the row's file path on one line, then its source.

    ``pandaSeries`` only has to support lookup by the keys ``'file_path'``
    and ``'source'``.
    """
    path = pandaSeries['file_path']
    body = pandaSeries['source']
    return f"file path: {path}\n {body}"


### doing stuff!!
# Parse every Go file under the target tree into one DataFrame of code blocks.
ts_helper = TS_Setup_Helper('./tree-go.so', 'go')

# Collect the per-file frames first and concatenate once: concatenating inside
# the loop copies the accumulated frame on every iteration. Files that yield
# no blocks are skipped.
parsed_frames = [
    ts_helper.parse_file(path)
    for path in get_files_to_parse("../../dirserver/src/dirserver/")
]
parsed_frames = [frame for frame in parsed_frames if not frame.empty]
# ignore_index gives every row a unique label instead of restarting at 0 for
# each file.
code_df = pd.concat(parsed_frames, ignore_index=True) if parsed_frames else pd.DataFrame()

code_df["blob"] = code_df.apply(blobify, axis=1)
print(type(code_df))
print(code_df)

code_df.to_csv('1test_with_blob.csv')
# NOTE(review): the original early exit is kept, so nothing below this line
# runs. The remaining steps call the OpenAI API for every row; remove this
# statement to enable them. `raise SystemExit` replaces `exit()`, which is a
# helper injected by the `site` module rather than a guaranteed builtin.
raise SystemExit

print('startng to generate summary')
code_df["summary"] = code_df.blob.apply(generate_summary)
print('done with generate summary')

print('generating embeddings')
embedding_model = "text-embedding-ada-002"
# Pass the callable itself: wrapping it in a list makes Series.apply return a
# DataFrame (one column per function) instead of a Series.
code_df["embedding_summary"] = code_df.summary.apply(lambda x: get_embedding(x, engine=embedding_model))
print('done with generating embeddings')

code_df.to_csv('test_with_summary_and_embeddings.csv')