Miscellaneous
Where is Python? Which version?
Some possibilities, for example:
[code]import platform
print('Python version:\n', platform.python_version())
import sys
print('\nWhere is Python:')
print(sys.executable)
import sys
print('\nPython version:')
print(sys.version)[/code]
Create a dataframe from a previous dataframe (copy)
[code]df2 = df1.copy()[/code]
Copy data from a dataframe in the clipboard
Very convenient to copy-paste quickly in a text file, CSV, Excel ...
[code]df.to_clipboard(sep=',', index=False, header=None)[/code]
Get list from field
[code]MyList = df['My field'].drop_duplicates().dropna().sort_values(ascending=True).tolist()
print(MyList)[/code]
Get value from a cell
[code]myValue = df.iloc[0]['my_field'][/code]
Get column names in a list
[code]listColumn = df.columns.tolist()[/code]
Dedupe
To dedupe a dataframe on an Email field keeping only the first duplicate record, use drop_duplicates.
[code]df['Email'] = df['Email'].str.lower()
dfDeduped = df.drop_duplicates(subset=['Email'], keep='first')[/code]
Find duplicates
Use duplicated:
[code]df2 = df1[df1.duplicated(['My field'], keep=False)]
print(tabulate(df2.head(10), headers='keys', tablefmt='psql', showindex=False))[/code]
Find no-duplicates
[code]df2 = df1[~df1.duplicated(['My field'], keep=False)]
print(tabulate(df2.head(10), headers='keys', tablefmt='psql', showindex=False))[/code]
Export a picture from a chart
And add a border.
[code]from PIL import Image, ImageOps
fig1.savefig('C:/Users/myplot.png', dpi=100)
im = Image.open('C:/Users/myplot2.png')
bordered = ImageOps.expand(im, border=1, fill=(0, 0, 0))
bordered.save('C:/Users/myplot2.png')[/code]
Re-organize columns
[code]df = df[['Field 1', 'Adresse mail', 'Note algorithmique', 'Substitutions']][/code]
Rename dataframe
[code]New_df = pd.DataFrame(Old_df)[/code]
Rename columns
From their original name:
[code]df.rename(columns={'Old name 1': 'New name 1', 'Old name 1': 'New name 1'}, inplace=True)[/code]
From their position:
[code]df.columns.values[12] = 'New name'[/code]
Delete column
[code]df.drop('My field', axis=1, inplace=True)[/code]
Format number with space thousand separator
[code]number = df.shape[0]
print(format(number, ',').replace(',', ' '), 'customers')[/code]
Or:
[code]f"{number_of_rows:,}".replace(',', ' ')[/code]
Ranking
To rank according to a field, from highest to lowest:
[code]df['My rank'] = df['My field to rank'].rank(ascending=False)[/code]
Avoid duplicate:
[code]df['My rank'] = df['My field to rank'].rank(ascending=False, method='first')[/code]
Generate a unique random integer
If you want a unique random integer from 1 included to the number of records included.
[code]howManyRecord = df.shape[0]
df['Random number'] = np.random.choice(range(howManyRecord), howManyRecord, replace=False)
df.loc[df['Random number'] == 0, 'Random number'] = howManyRecord[/code]
Select with condition
[code]df = df.loc[df['Field to filter'] == 'Yes'][/code]
Prevent nan
Use keep_default_na=False :
[code]Data = pd.read_excel(inputExcelFile, sheet_name='Feuil1', engine='openpyxl', keep_default_na=False)[/code]
Replace the default NA value
[code]Data = pd.read_excel(inputExcelFile, sheet_name='Feuil1', engine='openpyxl', keep_default_na=False), na_values=['_'])[/code]
Get data like a SQL UPDATE
Very usefull to build SQL update queries from a file with the primary key in first column :
[code]file1 = open(MyPath+'FixQueries.sql', 'w', encoding='cp1252')
# ÉCRITURE DU FICHIER
for i in df.itertuples(index=False):
file1.write('UPDATE MyTable SET ' +\
str(i)[7:-1].split(', ', 1)[-1] +\
" WHERE id = '" + i[0] + "' ;\n")[/code]
Get data like a SQL INSERT
[code]# SQL TO INSERT IN TARGET
for index, row in df.iterrows():
columns = ', '.join(row.index)
values = ', '.join(f"'{v}'" for v in row.values)
query = f'''INSERT IGNORE INTO ma_table ({columns}) VALUES ({values}) ;'''
print(query)[/code]
Use index in concatenation
[code]df.index.astype(str)[/code]
Reset index
[code]df.reset_index(drop=True, inplace=True)[/code]
Drop index
To drop the index, you have to use another field as index.
df = df.set_index('my field')
Open a box to enter a string and use it as a variable
[code]import tkinter as tk
from tkinter import simpledialog
window = tk.Tk()
window.eval('tk::PlaceWindow . center')
window.withdraw()
repertoireCarto = simpledialog.askstring("Saisie", "Répertoire ?\t\t\t\t", parent=window)
print('Le répertoire est', repertoireCarto)[/code]
Get some line indexes according condition
[code]list_duplicate = df.index[df['My field'] == 'Text'].tolist()[/code]
Import external scripts
Your working script needs to know where to find the external scripts:
[code]sys.path.append('C:\\Users\\Georges\\PycharmProjects\\Your_Directory')
from YourScriptWithoutExtension import SomeVariables, ...[/code]
Subprocess
Not related to Pandas but very useful to run shell command, bash, console scripts...
[code]import subprocess
subprocess.run('psql -h localhost -p 5432 -d work -U postgres -c "SELECT NOW() ;"', shell=True)[/code]
And to use pgsql2shp in subprocess for shape export:
[code]subprocess.run( \
'''pgsql2shp -f E:\\Path\\to\\MyShape -u postgres -h localhost work "SELECT * FROM MyTable ;"''', \
shell=False, \
executable='E:\\Path\\to\\pgsql2shp.exe' \
)[/code]
And to use PowerShell or OGR2OGR in subprocess :
[code]subprocess.run( \
'''OGR2OGR...''', \
shell=False, \
executable='E:\\Path\\to\\powershell.exe' \
)[/code]
Strings comparison
Compare the similarity of 2 strings, to get a indicator in a new field:
[code]import difflib
from difflib import SequenceMatcher
df['indicator'] = df[['Field1', 'Field2']].apply(lambda x: SequenceMatcher(lambda y: y == " ", x[0], x[1]).ratio(), axis=1)[/code]
Count time execution
[code]import time
...
print('Time for now is %s secondes' % round(time.process_time(), 1))
timeNow = time.process_time()
...
print('Time now is %s secondes' % round(time.process_time() - timeNow, 1))
timeNow = time.process_time()
...[/code]
Include external Python code (without real importation)
[code]...
YourFileCode = r'C:/Users/Georges/PycharmProjects/WorkEMC/YourFileCode.py'
...
exec(compile(open(YourFileCode, 'rb').read(), YourFileCode, 'exec'))[/code]
Concatenate fields without empty value
[code]df['adresse'] = df[['Filed1', 'Filed2', 'Filed3']].apply(lambda x: ' '.join(x.dropna()), axis=1)[/code]
Force data type creating the dataframe
[code]df = pd.read_csv(workDirectory + FileName,
sep=';',
usecols=fieldsExisting,
dtype={'firstname': 'str', 'lastname': 'str'}
)[/code]
- Tips:
- Sometimes when your data comes from an Excel file and contains percentages,
dtypewill be non-efficient because Excel stores percentages as numbers. There is a solution in the chapter Advanced read/write in Excel of this article, just search Fix Excel percentages.
Fix scientific notation for phone numbers in Pandas, Tabulate and Excel export
Example for phone numbers:
[code]df['Mobile Phone'] = df['Mobile Phone'].astype(str)
df['Mobile Phone'] = df['Mobile Phone'].fillna('')
df['Mobile Phone'] = df['Mobile Phone'].replace(['nan'], '')
df['Mobile Phone'] = df['Mobile Phone'].apply(lambda x: x[:-2] if x.endswith('.0') else x)[/code]
Get file/directory date creation
[code]import os
import time
from time import gmtime, strftime
myRep = 'C://Path/of/my/directory'
myDateCreation = time.ctime(os.path.getctime(myRep))
print('\n' + str(myDateCreation))
myConvertDateCreation = time.strptime(myDateCreation)
print("\nFull:", myConvertDateCreation)
print("\nDay:", myConvertDateCreation.tm_mday)
print("Month:", myConvertDateCreation.tm_mon)
print("Year:", myConvertDateCreation.tm_year)
print("Hour:", myConvertDateCreation.tm_hour)
print("Minutes:", myConvertDateCreation.tm_min)
print(strftime("\n%A %d %B %Y %H:%M", myConvertDateCreation ))[/code]
Use a dataframe not yet created in a function
Pass it as global:
[code]def MyDeletion():
global df
ind_drop = df[df['My field'].apply(lambda x: x == ('My value'))].index
df = df.drop(ind_drop)[/code]
Organize your screens (new)
[code]import subprocess
import pygetwindow as gw
import time
import os
import psutil
# CLOSE PREVIOUS WINDOWS
TARGET = "explorer.exe"
[process.kill() for process in psutil.process_iter() if process.name() == TARGET]
# OPEN A WINDOW
subprocess.Popen('explorer "C:/Users/Downloads"')
# SEE WINDOWS TITLES
print('My windows : ' + str(gw.getAllTitles()))
# MOVE A WINDOW
for window in gw.getAllTitles():
if window == 'Downloads':
explorateur = gw.getWindowsWithTitle(window)[0]
explorateur.resizeTo(500, 505) # width, height
explorateur.moveTo(11, 15) # from left, from right[/code]
Organize your screens (old)
Attention: wait the end of the process for proper screens!
[code]import subprocess
import clipboard
import pyautogui
import pygetwindow as gw
from screeninfo import get_monitors
# Callback function to find file explorer windows
def enum_windows_callback(hwnd, results):
if win32gui.GetClassName(hwnd) == 'CabinetWClass':
results.append(hwnd)
return True
# Get all File Explorer windows
explorer_windows = []
win32gui.EnumWindows(enum_windows_callback, explorer_windows)
# Close every File Explorer window
for hwnd in explorer_windows:
# Activate window
win32gui.ShowWindow(hwnd, win32con.SW_SHOW)
win32gui.SetForegroundWindow(hwnd)
# Use pyautogui to send Alt+F4 key combination to close window
pyautogui.hotkey('alt', 'f4')
# Open File Explorer
subprocess.call("start explorer", shell=True)
# Wait a short while for File Explorer to fully open
pyautogui.sleep(1)
# Select the desired screen (for example, the first screen)
monitors = get_monitors()
screen = monitors[0]# Récupérer les coordonnées du coin supérieur gauche de l'écran
left = screen.x
top = screen.y
# Find the File Explorer window
explorer_window = gw.getWindowsWithTitle('Explorateur de fichiers')[0]
# Desired position on the screen
pyautogui.sleep(0.5)
x_position = 10 # To the right
y_position = 15 # To the down
explorer_window.moveTo(x_position, y_position)
# Size
pyautogui.sleep(0.5)
width = 600
height = 600
explorer_window.resizeTo(width, height)
# Directory
pyautogui.sleep(0.5)
directory_path = 'C:\\Users\\Georges\\Downloads'
clipboard.copy(directory_path)
pyautogui.hotkey('alt', 'a')
# pyautogui.typewrite(directory_path)
pyautogui.hotkey('ctrl', 'v')
pyautogui.press('enter')
# pyautogui.click(button='right')
# subprocess.call("start notepad", shell=True)C:\Users\Georges\Downloads[/code]
Search a string in database
[code]from tabulate import tabulate as tab
import pandas as pd
from sqlalchemy import create_engine
SearchWord = 'anyxxxtube'
username = 'XXXXX'
password = 'XXXXX'
port = 3306
database = 'XXXXX'
host = 'XXXXX'
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (username, password, host, port, database))
# SEARCH TABLES AND FIELDS
sql = 'SELECT table_name, column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name LIKE "prime_%%" ;'
df = pd.read_sql_query(sql, engine)
print(tab(df.head(5), headers='keys', tablefmt='psql', showindex=False))
print(df.shape[0])
# BUILD QUERIES
Assemblage = list(zip(df['table_name'].tolist(), df['column_name'].tolist()))
# for a, b in Assemblage:
# print(a, ' - ', b)
for a, b in Assemblage:
sql = 'SELECT '' + b + '' FROM '' + a + '' WHERE '' + b + '' LIKE "%%' + SearchWord + '%%" ;'
df = pd.read_sql_query(sql, engine)
if df.shape[0] > 0:
print('In table "' + a + '" :')
print(tab(df.head(5), headers='keys', tablefmt='psql', showindex=False))
print(df.shape[0])[/code]
Name dataframes dynamically
In a loop for exampe, with globals():
[code]ListFiles = ['Stats Name1.xlsx', 'Stats Name2.xlsx']
for f in ListFiles:
dfTemp = pd.read_excel(outputDir + f, sheet_name='Sexe', engine='openpyxl')
name = f.replace('.xlsx', '').replace('Stats ', '')
globals()[name] = dfTemp
print('\n' + name + ' :')
print(tabulate(globals()[name].head(35), headers='keys', tablefmt='psql', showindex=False))[/code]