Read/export data file (CSV, Excel, MySQL, Postgres, JSON, Shape ...)

Read and display an Excel file

[code]import pandas as pd
from tabulate import tabulate

# Load only the two named columns from the 'Export' sheet of the workbook.
df = pd.read_excel('C:/Users/Your file.xlsx',
sheet_name='Export', engine='openpyxl', usecols=['My first column', 'My second column'])

# Print the first 35 rows as a psql-style table, with the index shown.
print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=True))[/code]

I set engine='openpyxl' because the source is an XLSX file. sheet_name specifies the sheet to import, and usecols lists the columns to keep.

Then I use tabulate to display my data properly, in the Postgres style (tablefmt='psql').

With head(35) I only display the first 35 rows, but the dataframe, named df here, contains all the data.

If you want to re-organize the columns, please see the Miscellaneous chapter.

Read a CSV file

[code]import pandas as pd
from tabulate import tabulate

df_subscriber = pd.read_csv(workDirectory+'joo_acymailing_subscriber.csv', sep=';', usecols=['subid', 'source', 'email'])

print(tab(df_subscriber.head(10), headers='keys', tablefmt='psql', showindex=False))[/code]

Add textual column names from a CSV without column names

Use names=[] :

[code]import pandas as pd
from tabulate import tabulate

# names=[...] supplies the column labels for a file that has no header line.
# keep_default_na=False together with astype(str) keeps every value as text.
df = pd.read_csv(inpuMyFilePath, names=['Mon Champ 1', 'Mon Champ 2', 'Mon Champ 3'], sep=',', keep_default_na=False).astype(str)

# Print the first 10 rows, without the index.
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))[/code]

Read a non-standard Excel file

Sometimes a file has unwanted rows at the top, or you cannot rely on its column names staying the same in the future.

For example, to start reading a file from a specific row and to skip its last rows:

[code]import pandas as pd
from tabulate import tabulate

# header=5-1: the header is on the 5th row of the sheet (header is 0-indexed).
# usecols=[0,1] keeps the first two columns, skipfooter=1 drops the last row,
# and names=[...] replaces whatever labels the file contains.
df = pd.read_excel(r'C:/Users/Georges/Downloads/test excel.xlsx',
sheet_name='Sheet1', engine='openpyxl', usecols=[0,1], header=5-1, skipfooter=1, names=['id', 'value'])

print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=False))[/code]

Or skip a specific line (with skiprows=[...]):

[code]...
# skiprows takes 0-indexed row numbers: [2] skips the third row of the sheet.
df = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl', skiprows=[2])
...[/code]

Use first row as label column

[code]my_labels = df.iloc[0]
# Drop that first row from the data, then use its values as the column labels.
df = df[1:]
df.columns = my_labels[/code]

Read from MySQL with sqlalchemy

sqlalchemy is the best way with pandas:

[code]from tabulate import tabulate as tab
import pandas as pd
from sqlalchemy import create_engine

# Connection settings, assembled into the SQLAlchemy URL below.
username = 'root'
password = ''
port = 3308
database = 'my_beautiful_db'
host ='localhost'

# URL layout: mysql+mysqldb://user:password@host:port/database
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (username, password, host, port, database))
sql = 'SELECT COUNT(*) AS total FROM communes ;'
# set_index('total') moves the 'total' column into the index of the dataframe.
df = pd.read_sql_query(sql, engine).set_index('total')

print(tab(df.head(5), headers='keys', tablefmt='psql'))

# Release the connections held by the engine.
engine.dispose()[/code]

Specify database encoding

[code]# The charset is passed as a query parameter at the end of the connection URL.
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (username, password, host, port, database))[/code]

Execute SQL queries from a file with sqlalchemy

[code]from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

my_engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (my_username, my_password, my_host, my_port, my_database))

my_sql_file = 'C:/MyQueries.sql'

# LOAD THE SQL SCRIPT
with open(queriesDirectory + my_sql_file, 'r') as sql_file:
sql_script = sql_file.read()

# CREATE SESSION
my_session = sessionmaker(bind=my_engine)
session = my_session()

# START A TRANSACTION
session.begin()

try:
session.execute(text(sql_script))
print('Execution of "' + my_sql_file + '" finished!')
except:
print('We can not execute "' + my_sql_file + '"!')
pass

# VALIDATE THE TRANSACTION
session.commit()

# CLOSE SESSION AND CONNECTION
session.close()
my_engine.dispose()[/code]

Get result from a SQL query with sqlalchemy for one result

[code]engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (userTarget, passwordTarget, hostTarget, portTarget, databaseTarget))
sql = 'SELECT 1 ;'
my_connection = engine.connect()
my_query = my_connection.execute(text(sql)).scalar()[/code]

Read from MySQL with mysqlclient

[code]import MySQLdb
from tabulate import tabulate
import pandas as pd

mydb = MySQLdb.connect(user='root', password='', database='MyDb', host='localhost', port=3306)
query = 'SELECT name FROM users ;'

# Load the query result into a dataframe, then close the connection:
# the dataframe no longer needs it.
df = pd.read_sql(query, mydb)
mydb.close()

print(tabulate(df.head(10), headers='keys', tablefmt='psql'))
# df.shape[0] is the number of rows returned by the query.
number = df.shape[0]
print(number)[/code]

Execute SQL queries from a file with mysqlclient

[code]import MySQLdb
from termcolor import colored

conn = MySQLdb.connect(host='localhost',user='root',password='',database='ma_base',port=3308)

my_sql_file = 'C:/MyQueries.sql'

with open(sqlQueries, 'r', encoding='utf8') as sql_file:
sql_script = str(my_sql_file.read())

try:
cursor = conn.cursor()
cursor.execute(sql_script)
cursor.close()
print(colored('\nOK!', 'green'))
except:
print(colored('\nIssue!', 'red'))
pass[/code]

Read from MySQL with mysql.connector

[code]import mysql.connector as connection
import pandas as pd
from tabulate import tabulate as tab

mydb = connection.connect(host='XXX.XX.XXX.XX', port=3308, database='MyDb',user='Me', passwd='pwdXXX!456', use_pure=True)

try:
query = "SELECT firstname, lastname FROM customer ;"
df = pd.read_sql(query, mydb)
except Exception as e:
mydb.close()
print(str(e))

print(tab(df.head(5), headers='keys', tablefmt='psql'))[/code]

Read from Postgres

[code]import pandas as pd
from tabulate import tabulate
import psycopg2

con = psycopg2.connect(database='mytransformations', user='postgres', password='', host='localhost', port='5432')
print("OK, we are connect to mytransformations")

# DISPLAY WITH PANDAS
# The whole table is loaded into the dataframe, then the connection is closed.
df = pd.read_sql("SELECT * FROM myparcels ;", con)
con.close()

# Remove the 'geom' column before printing the table.
df.drop('geom', axis=1, inplace=True)

print('\nResults:')
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))
# df.shape[0] is the number of rows.
print('Number of records:', df.shape[0])[/code]

Read from a JSON

[code]import pandas as pd
from tabulate import tabulate
import json
import requests

response = requests.get(MON_URL)
my_file = json.loads(response.text)
my_json = my_file['data']

mon_df = pd.DataFrame.from_dict(my_json)[/code]

Read from a shape

Use geopandas:

[code]import geopandas as gpd
from tabulate import tabulate

myShape = 'C:\\Path\\Of\\My\\Shape.shp'

print('\n' + myShape)

df = gpd.read_file(myShape)
# The regex removes everything from the first space or opening parenthesis onwards;
# presumably what is left is the geometry type name (assumes a WKT-like text form - confirm).
df['type'] = df['geometry'].astype(str).str.replace(r' .*|\(.*', '', regex=True)
# Keep only these three columns.
df = df[['id', 'type', 'geometry']]

print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=True))
print(df.shape[0])

# Distinct non-null ids, sorted in descending order.
MyFieldList = df['id'].drop_duplicates().dropna().sort_values(ascending=False).tolist()
print('\n' + str(MyFieldList))

# Distinct non-null values of the 'type' column, sorted in descending order.
MyGeomTypeList = df['type'].drop_duplicates().dropna().sort_values(ascending=False).tolist()
print('\n' + str(MyGeomTypeList))[/code]

Export a data-frame in a new Excel file

[code]# Write df to the given workbook path, in a sheet named 'Tab 1'.
df.to_excel("C:/Users/Your new file.xlsx", index=False, sheet_name='Tab 1')[/code]

Use index=False to leave out the index column.

Export a data-frame in an existing Excel sheet

Use mode='a' and if_sheet_exists:

[code]from openpyxl import load_workbook
import openpyxl

My_File_Path= 'C:/My/File.xlsx'

with pd.ExcelWriter(My_File_Path, 'openpyxl', mode='a', if_sheet_exists='replace') as writing:
df.to_excel(writing, 'My sheet', index=False)[/code]

Export several dataframes in Excel sheets

If you have several data-frames to export to one Excel file, each in its own sheet, you should open an ExcelWriter, and close it when you are done.

[code]writer = pd.ExcelWriter('C:/Users/Your new file.xlsx', engine='xlsxwriter')

df_Country.to_excel(writer, index=False, sheet_name='Country', header=['Country', 'Total'])
df_Emails.to_excel(writer, index=False, sheet_name='Emails', header=['Emails', 'Total'])

writer.save()[/code]

Export several dataframes in the same sheet

[code]# writer is assumed to be an open pd.ExcelWriter. startrow/startcol give the
# upper-left cell where this dataframe is written, so several dataframes can share a sheet.
df.to_excel(writer, index=False, sheet_name='Registrations', startcol=3, startrow=34, header=['Filed', 'Total', '%'])[/code]

Export a dataframe as a txt file, with separator and ANSI encoding

[code]# Tab-separated output without header or index. 'mbcs' is the Windows ANSI
# code page codec; it only exists on Windows.
df.to_csv(my_file_import, header=None, index=None, sep='\t', encoding='mbcs')[/code]

Add filters in all columns in one sheet in an Excel file

[code]import openpyxl
from openpyxl.utils import get_column_letter

outputExcelFile = r'C:/User/Your file.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)

worksheet = workbook['Sheet name']
# Range from A1 to the last used column and row of the sheet, e.g. 'A1:F250'.
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
# The auto-filter is applied to that whole range.
worksheet.auto_filter.ref = FullRange

workbook.save(outputExcelFile)[/code]

Add filters in all columns in all sheets in an Excel file

[code]import openpyxl
from openpyxl.utils import get_column_letter

outputExcelFile = r'C:/User/Your file.xlsx'

workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames

for sheet in sheetsLits:
worksheet = workbook[sheet]
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
worksheet.auto_filter.ref = FullRange
workbook.save(outputExcelFile)[/code]

Add colors in all column names in a sheet in an Excel file

[code]import openpyxl
from openpyxl.styles import PatternFill

workbook = openpyxl.load_workbook(My_File_Path)
worksheet = workbook['My sheet']

for cell in worksheet[1]:
worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid')
workbook.save(My_File_Path)[/code]

Add colors in all column names in all sheets in an Excel file

[code]import openpyxl
from openpyxl.styles import PatternFill

outputExcelFile = r'C:/Users/Your File.xlsx'

workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames

for sheet in sheetsLits:
worksheet = workbook[sheet]
for cell in workbook[sheet][1]:
worksheet[cell.coordinate].fill = PatternFill(fgColor = 'FFC6C1C1', fill_type = 'solid')
workbook.save(outputExcelFile)[/code]

Modify a column width in one sheet in an Excel file

[code]import openpyxl

outputExcelFile = r'C:/Users/Your File.xlsx'

workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames

workbook['Your sheet'].column_dimensions['A'].width = 75
workbook.save(outputExcelFile)[/code]

Modify all columns width in all sheets in an Excel file

The code below sets the width of column A to 30, and the width of the other columns to 10.

[code]import openpyxl
from openpyxl.utils import get_column_letter

outputExcelFile = r'C:/Users/Your File.xlsx'

workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames

for sheet in sheetsLits:
for cell in workbook[sheet][1]:
if get_column_letter(cell.column) == 'A':
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 30
else:
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 10

workbook.save(outputExcelFile)[/code]

Add color to some rows, getting their indexes from a condition

[code]list_duplicate = df.index[df['Ligne en doublon'] == 'DOUBLON !'].tolist()

for index in list_duplicate:
for cell in worksheet[index+2]:
worksheet[cell.coordinate].fill = PatternFill(fgColor='FF6666', fill_type='solid')
workbook.save(My_File_Path)[/code]

Freeze the top row of an Excel file

[code]# Rows above the given cell are frozen: 'A2' freezes row 1 only.
worksheet.freeze_panes = 'A2'[/code]

Freeze the top row and the first column of an Excel file

[code]# Rows above and columns to the left of the given cell are frozen:
# 'B2' freezes row 1 and column A.
worksheet.freeze_panes = 'B2'[/code]

Freeze the top row and the 2 first columns of an Excel file

[code]# Rows above and columns to the left of the given cell are frozen:
# 'C2' freezes row 1 and columns A and B.
worksheet.freeze_panes = 'C2'[/code]

Change font color and style in an Excel file

[code]from openpyxl.styles import Font

# Cell A1 gets a bold, non-italic font; color is a hexadecimal RGB value ('FFFFFF' is white).
worksheet['A1'].font = Font(color='FFFFFF', italic=False, bold=True)[/code]

Work on several Excel files

Here we freeze the top row, add filters, color the header row and set the width of all columns in 3 Excel files.

[code]import openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill


File1 = r'D:/Work/My file 1.xlsx'
File2 = r'D:/Work/My file 2.xlsx'
File3 = r'D:/Work/My file 3.xlsx'

ExcelList = [File1, File2, File3]

for ExcelFile in ExcelList:
workbook = openpyxl.load_workbook(ExcelFile)
sheetsLits = workbook.sheetnames

for sheet in sheetsLits:
worksheet = workbook[sheet]
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
worksheet.auto_filter.ref = FullRange
worksheet.freeze_panes = 'A2'
for cell in workbook[sheet][1]:
worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid')
if get_column_letter(cell.column) == 'A':
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 12
else:
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 20
workbook.save(ExcelFile)[/code]

Insert a picture in an Excel file

And delete the picture file afterwards.

[code]img = openpyxl.drawing.image.Image('C:/Users/myplot.png')
img.anchor = 'E6'

workbook['Your sheet'].add_image(img)
workbook.save(outputExcelFile)

os.remove('C:/Users/myplot.png')[/code]

Open a box to search for a file and get its path

Use Tkinter.

[code]from tkinter import Tk
from tkinter.filedialog import askopenfilename

Tk().withdraw()
filename = askopenfilename()
print(filename)[/code]

Open a box to save an Excel file

[code]from tkinter import Tk
from tkinter.filedialog import asksaveasfilename

fileName = asksaveasfilename(filetypes=(("Excel files", "*.xlsx"), ("All files", "*.*")), defaultextension='.xslx')
df.to_excel(fileName, index=False, sheet_name='Tab 1', header=None)[/code]

Check if a file exists

And use it only if it exists:

[code]import os

my_file = 'C:/_gh/0/export.xlsx'
if os.path.isfile(my_file) == True:
df_ExportFeuil1 = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl')[/code]

Rename a file

[code]import os
# os.rename(source, destination); relative names are resolved against the current working directory.
os.rename('My file.xlsx', 'My file renamed.xlsx')[/code]

Search duplicate columns in Excel

[code]from openpyxl import load_workbook

wb = load_workbook(filename=my_file)

Feuil1 = wb['Feuil1']

# Récupérer les noms de colonnes dans la 3ème ligne
noms_colonnes = [colonne.value for colonne in Feuil1[3]]
print(noms_colonnes)

# Vérifier les doublons dans les noms de colonnes
doublons = set()
noms_en_double = set()
for nom_colonne in noms_colonnes:
if nom_colonne in doublons:
noms_en_double.add(nom_colonne)
else:
doublons.add(nom_colonne)

# Afficher les noms de colonnes en doublons
if noms_en_double:
print(colored('Attention colonne en doublon dans le fichier Excel :', 'red'))
print(colored(str(noms_en_double).replace('{', '').replace('}', ''), 'red'))
sys.exit()

else:
print(colored('OK, pas de colonne en doublon dans le fichier Excel.', 'green'))[/code]