Read/export data file (CSV, Excel, MySQL, Postgres, JSON, Shape ...)
Read and display an Excel file
[code]# Load two named columns from the 'Export' sheet of an .xlsx workbook and preview them.
import pandas as pd
from tabulate import tabulate
df = pd.read_excel('C:/Users/Your file.xlsx',
sheet_name='Export', engine='openpyxl', usecols=['My first column', 'My second column'])
# Only the first 35 rows are printed; df itself still holds every row that was read.
print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=True))[/code]
I set engine='openpyxl' because it is from an XLSX file. sheet_name specifies the sheet to import, specify your columns with usecols.
Then I use tabulate to display my data properly, as a Postgres-style table (tablefmt='psql').
With head(35) I only display the first 35 rows, but the dataframe (named df here) contains all the data.
If you want to re-organize the columns, please see the Miscellaneous chapter.
Read a CSV file
[code]import pandas as pd
from tabulate import tabulate
df_subscriber = pd.read_csv(workDirectory+'joo_acymailing_subscriber.csv', sep=';', usecols=['subid', 'source', 'email'])
print(tab(df_subscriber.head(10), headers='keys', tablefmt='psql', showindex=False))[/code]
Add textual column names from a CSV without column names
Use names=[] :
[code]# Read a header-less CSV, supplying the column labels explicitly with names=[...].
import pandas as pd
from tabulate import tabulate
# keep_default_na=False disables pandas' default NaN markers, and .astype(str)
# casts every column to string.
df = pd.read_csv(inpuMyFilePath, names=['Mon Champ 1', 'Mon Champ 2', 'Mon Champ 3'], sep=',', keep_default_na=False).astype(str)
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))[/code]
Read a non-standard Excel file
Sometimes a file has wrong top lines, or you can not trust the future column names.
For example, to read a file starting from a specific row and skipping the last rows:
[code]# Read a sheet whose real header is not on the first row and whose last row must be ignored.
import pandas as pd
from tabulate import tabulate
# usecols=[0,1] selects the first two columns by position.
# header=5-1 is the zero-based index of sheet row 5; skipfooter=1 drops the last row.
# names=[...] replaces whatever labels are found in the file.
df = pd.read_excel(r'C:/Users/Georges/Downloads/test excel.xlsx',
sheet_name='Sheet1', engine='openpyxl', usecols=[0,1], header=5-1, skipfooter=1, names=['id', 'value'])
print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=False))[/code]
Or skip a specific line (with skiprows=[...]):
[code]...
# skiprows takes zero-based row indexes, so [2] drops the third row of the sheet.
df = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl', skiprows=[2])
...[/code]
Use first row as label column
[code]# Promote the first data row to column labels.
# Keep a copy of the first row before it is removed from the data.
my_labels = df.iloc[0]
# Drop that first row from the data ...
df = df[1:]
# ... and use its values as the column names.
df.columns = my_labels[/code]
Read from MySQL with sqlalchemy
sqlalchemy is the best way with pandas:
[code]# Run a COUNT query against MySQL through a SQLAlchemy engine and print the result.
from tabulate import tabulate as tab
import pandas as pd
from sqlalchemy import create_engine
username = 'root'
password = ''
port = 3308
database = 'my_beautiful_db'
host ='localhost'
# Connection URL layout: mysql+mysqldb://user:password@host:port/database
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (username, password, host, port, database))
sql = 'SELECT COUNT(*) AS total FROM communes ;'
# set_index('total') moves the single 'total' column into the index.
df = pd.read_sql_query(sql, engine).set_index('total')
print(tab(df.head(5), headers='keys', tablefmt='psql'))
# Release the engine's pooled connections once done.
engine.dispose()[/code]
Specify database encoding
[code]# Same URL as for a plain connection, with ?charset=utf8mb4 appended as a query-string option.
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (username, password, host, port, database))[/code]
Execute SQL queries from a file with sqlalchemy
[code]from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
my_engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (my_username, my_password, my_host, my_port, my_database))
my_sql_file = 'C:/MyQueries.sql'
# LOAD THE SQL SCRIPT
with open(queriesDirectory + my_sql_file, 'r') as sql_file:
sql_script = sql_file.read()
# CREATE SESSION
my_session = sessionmaker(bind=my_engine)
session = my_session()
# START A TRANSACTION
session.begin()
try:
session.execute(text(sql_script))
print('Execution of "' + my_sql_file + '" finished!')
except:
print('We can not execute "' + my_sql_file + '"!')
pass
# VALIDATE THE TRANSACTION
session.commit()
# CLOSE SESSION AND CONNECTION
session.close()
my_engine.dispose()[/code]
Get result from a SQL query with sqlalchemy for one result
[code]engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (userTarget, passwordTarget, hostTarget, portTarget, databaseTarget))
sql = 'SELECT 1 ;'
my_connection = engine.connect()
my_query = my_connection.execute(text(sql)).scalar()[/code]
Read from MySQL with mysqlclient
[code]# Read a query result into a DataFrame through a raw MySQLdb (mysqlclient) connection.
import MySQLdb
from tabulate import tabulate
import pandas as pd
mydb = MySQLdb.connect(user='root', password='', database='MyDb', host='localhost', port=3306)
query = 'SELECT name FROM users ;'
df = pd.read_sql(query, mydb)
# The data is already in df, so the connection can be closed before displaying it.
mydb.close()
print(tabulate(df.head(10), headers='keys', tablefmt='psql'))
# df.shape[0] is the number of rows returned by the query.
number = df.shape[0]
print(number)[/code]
Execute SQL queries from a file with mysqlclient
[code]import MySQLdb
from termcolor import colored
conn = MySQLdb.connect(host='localhost',user='root',password='',database='ma_base',port=3308)
my_sql_file = 'C:/MyQueries.sql'
with open(sqlQueries, 'r', encoding='utf8') as sql_file:
sql_script = str(my_sql_file.read())
try:
cursor = conn.cursor()
cursor.execute(sql_script)
cursor.close()
print(colored('\nOK!', 'green'))
except:
print(colored('\nIssue!', 'red'))
pass[/code]
Read from MySQL with mysql.connector
[code]import mysql.connector as connection
import pandas as pd
from tabulate import tabulate as tab
mydb = connection.connect(host='XXX.XX.XXX.XX', port=3308, database='MyDb',user='Me', passwd='pwdXXX!456', use_pure=True)
try:
query = "SELECT firstname, lastname FROM customer ;"
df = pd.read_sql(query, mydb)
except Exception as e:
mydb.close()
print(str(e))
print(tab(df.head(5), headers='keys', tablefmt='psql'))[/code]
Read from Postgres
[code]# Read a Postgres table into a DataFrame through psycopg2 and display it without its geometry column.
import pandas as pd
from tabulate import tabulate
import psycopg2
con = psycopg2.connect(database='mytransformations', user='postgres', password='', host='localhost', port='5432')
print("OK, we are connect to mytransformations")
# DISPLAY WITH PANDAS
df = pd.read_sql("SELECT * FROM myparcels ;", con)
# The data is already in df, so the connection can be closed here.
con.close()
# Remove the 'geom' column in place before printing the table.
df.drop('geom', axis=1, inplace=True)
print('\nResults:')
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))
print('Number of records:', df.shape[0])[/code]
Read from a JSON
[code]# Download a JSON document and turn its 'data' entry into a DataFrame.
import pandas as pd
from tabulate import tabulate
import json
import requests
# MON_URL must be defined beforehand with the address of the JSON resource.
response = requests.get(MON_URL)
my_file = json.loads(response.text)
# Only the content stored under the top-level 'data' key is kept.
my_json = my_file['data']
mon_df = pd.DataFrame.from_dict(my_json)[/code]
Read from a shape
Use geopandas:
[code]# Read a shapefile with geopandas, derive a geometry-type column and list the distinct ids and types.
import geopandas as gpd
from tabulate import tabulate
myShape = 'C:\\Path\\Of\\My\\Shape.shp'
print('\n' + myShape)
df = gpd.read_file(myShape)
# Keep only the leading word of the geometry's text form: everything from the
# first space or opening parenthesis onwards is removed (presumably leaving the
# geometry type name, e.g. POINT - confirm on your data).
df['type'] = df['geometry'].astype(str).str.replace(r' .*|\(.*', '', regex=True)
df = df[['id', 'type', 'geometry']]
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=True))
print(df.shape[0])
# Distinct non-null ids, sorted in descending order.
MyFieldList = df['id'].drop_duplicates().dropna().sort_values(ascending=False).tolist()
print('\n' + str(MyFieldList))
# Distinct non-null geometry types, sorted in descending order.
MyGeomTypeList = df['type'].drop_duplicates().dropna().sort_values(ascending=False).tolist()
print('\n' + str(MyGeomTypeList))[/code]
Export a data-frame in a new Excel file
[code]# Write df to a new workbook; sheet_name sets the name of the tab that is created.
df.to_excel("C:/Users/Your new file.xlsx", index=False, sheet_name='Tab 1')[/code]
index=False to not get the index column.
Export a data-frame in an existing Excel sheet
Use mode='a' and if_sheet_exists:
[code]from openpyxl import load_workbook
import openpyxl
My_File_Path= 'C:/My/File.xlsx'
with pd.ExcelWriter(My_File_Path, 'openpyxl', mode='a', if_sheet_exists='replace') as writing:
df.to_excel(writing, 'My sheet', index=False)[/code]
Export several dataframes in Excel sheets
If you have several data-frames to export in one Excel file, each in its own sheet, you should open an ExcelWriter, and close it when you are done.
[code]writer = pd.ExcelWriter('C:/Users/Your new file.xlsx', engine='xlsxwriter')
df_Country.to_excel(writer, index=False, sheet_name='Country', header=['Country', 'Total'])
df_Emails.to_excel(writer, index=False, sheet_name='Emails', header=['Emails', 'Total'])
writer.save()[/code]
Export several dataframes in the same sheet
[code]# startcol / startrow are zero-based offsets of the upper-left cell where this dataframe is written,
# which lets several dataframes share one sheet without overlapping.
df.to_excel(writer, index=False, sheet_name='Registrations', startcol=3, startrow=34, header=['Filed', 'Total', '%'])[/code]
Export a dataframe as a txt file, with separator and ANSI encoding
[code]# Tab-separated export. 'mbcs' is Python's codec for the Windows ANSI code page, so this only runs on Windows.
# header=None / index=None: presumably treated as "do not write" - confirm against your pandas version.
df.to_csv(my_file_import, header=None, index=None, sep='\t', encoding='mbcs')[/code]
Add filters in all columns in one sheet in an Excel file
[code]# Turn on the auto-filter over the whole used range of one sheet.
import openpyxl
from openpyxl.utils import get_column_letter
outputExcelFile = r'C:/User/Your file.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)
worksheet = workbook['Sheet name']
# Build a range such as 'A1:F120', from A1 to the last used column and row.
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
worksheet.auto_filter.ref = FullRange
workbook.save(outputExcelFile)[/code]
Add filters in all columns in all sheets in an Excel file
[code]import openpyxl
from openpyxl.utils import get_column_letter
outputExcelFile = r'C:/User/Your file.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
for sheet in sheetsLits:
worksheet = workbook[sheet]
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
worksheet.auto_filter.ref = FullRange
workbook.save(outputExcelFile)[/code]
Add colors in all column names in a sheet in an Excel file
[code]import openpyxl
from openpyxl.styles import PatternFill
workbook = openpyxl.load_workbook(My_File_Path)
worksheet = workbook['My sheet']
for cell in worksheet[1]:
worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid')
workbook.save(My_File_Path)[/code]
Add colors in all column names in all sheets in an Excel file
[code]import openpyxl
from openpyxl.styles import PatternFill
outputExcelFile = r'C:/Users/Your File.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
for sheet in sheetsLits:
worksheet = workbook[sheet]
for cell in workbook[sheet][1]:
worksheet[cell.coordinate].fill = PatternFill(fgColor = 'FFC6C1C1', fill_type = 'solid')
workbook.save(outputExcelFile)[/code]
Modify a column width in one sheet in an Excel file
[code]import openpyxl
outputExcelFile = r'C:/Users/Your File.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
workbook['Your sheet'].column_dimensions['A'].width = 75
workbook.save(outputExcelFile)[/code]
Modify all columns width in all sheets in an Excel file
The code below sets the width of column A to 30, and the width of all other columns to 10.
[code]import openpyxl
from openpyxl.utils import get_column_letter
outputExcelFile = r'C:/Users/Your File.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
for sheet in sheetsLits:
for cell in workbook[sheet][1]:
if get_column_letter(cell.column) == 'A':
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 30
else:
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 10
workbook.save(outputExcelFile)[/code]
Add color to some rows, recovering their indexes from a condition
[code]list_duplicate = df.index[df['Ligne en doublon'] == 'DOUBLON !'].tolist()
for index in list_duplicate:
for cell in worksheet[index+2]:
worksheet[cell.coordinate].fill = PatternFill(fgColor='FF6666', fill_type='solid')
workbook.save(My_File_Path)[/code]
Freeze the top row of an Excel file
[code]# Rows above and columns to the left of the given cell stay visible: 'A2' freezes row 1 only.
worksheet.freeze_panes = 'A2'[/code]
Freeze the top row and the first column of an Excel file
[code]# Rows above and columns to the left of the given cell stay visible: 'B2' freezes row 1 and column A.
worksheet.freeze_panes = 'B2'[/code]
Freeze the top row and the 2 first columns of an Excel file
[code]# Rows above and columns to the left of the given cell stay visible: 'C2' freezes row 1 and columns A-B.
worksheet.freeze_panes = 'C2'[/code]
Change font color and style in an Excel file
[code]# Apply a bold, non-italic font with hex color FFFFFF (white) to cell A1.
from openpyxl.styles import Font
worksheet['A1'].font = Font(color='FFFFFF', italic=False, bold=True)[/code]
Work on several Excel files
Here we freeze the top row, add filters, color and change the width of all columns from 3 Excel files.
[code]import openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill
File1 = r'D:/Work/My file 1.xlsx'
File2 = r'D:/Work/My file 2.xlsx'
File3 = r'D:/Work/My file 3.xlsx'
ExcelList = [File1, File2, File3]
for ExcelFile in ExcelList:
workbook = openpyxl.load_workbook(ExcelFile)
sheetsLits = workbook.sheetnames
for sheet in sheetsLits:
worksheet = workbook[sheet]
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
worksheet.auto_filter.ref = FullRange
worksheet.freeze_panes = 'A2'
for cell in workbook[sheet][1]:
worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid')
if get_column_letter(cell.column) == 'A':
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 12
else:
workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 20
workbook.save(ExcelFile)[/code]
Insert a picture in an Excel file
And delete the picture file afterwards.
[code]img = openpyxl.drawing.image.Image('C:/Users/myplot.png')
img.anchor = 'E6'
workbook['Your sheet'].add_image(img)
workbook.save(outputExcelFile)
os.remove('C:/Users/myplot.png')[/code]
Open a box to search for a file and get its path
Use Tkinter.
[code]# Ask the user to pick a file and print the selected path.
from tkinter import Tk
from tkinter.filedialog import askopenfilename
# Hide the root window so that only the file dialog is shown.
Tk().withdraw()
filename = askopenfilename()
print(filename)[/code]
Open a box to save an Excel file
[code]from tkinter import Tk
from tkinter.filedialog import asksaveasfilename
fileName = asksaveasfilename(filetypes=(("Excel files", "*.xlsx"), ("All files", "*.*")), defaultextension='.xslx')
df.to_excel(fileName, index=False, sheet_name='Tab 1', header=None)[/code]
Check if a file exists
And use it only if it exists:
[code]import os
my_file = 'C:/_gh/0/export.xlsx'
if os.path.isfile(my_file) == True:
df_ExportFeuil1 = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl')[/code]
Rename a file
[code]# Rename a file: first argument is the current name, second is the new name.
import os
os.rename('My file.xlsx', 'My file renamed.xlsx')[/code]
Search duplicate columns in Excel
[code]from openpyxl import load_workbook
wb = load_workbook(filename=my_file)
Feuil1 = wb['Feuil1']
# Récupérer les noms de colonnes dans la 3ème ligne
noms_colonnes = [colonne.value for colonne in Feuil1[3]]
print(noms_colonnes)
# Vérifier les doublons dans les noms de colonnes
doublons = set()
noms_en_double = set()
for nom_colonne in noms_colonnes:
if nom_colonne in doublons:
noms_en_double.add(nom_colonne)
else:
doublons.add(nom_colonne)
# Afficher les noms de colonnes en doublons
if noms_en_double:
print(colored('Attention colonne en doublon dans le fichier Excel :', 'red'))
print(colored(str(noms_en_double).replace('{', '').replace('}', ''), 'red'))
sys.exit()
else:
print(colored('OK, pas de colonne en doublon dans le fichier Excel.', 'green'))[/code]