Python中字符串格式化、切片与常见陷阱详解
作者:小庄-Python办公
字符串是编程中最常用的数据类型之一,而 Python 的字符串功能强大且灵活。本文将深入探讨 Python 字符串的各种实战技巧,包括格式化方法、切片操作以及常见的陷阱和解决方案。
字符串基础回顾
Python 中的字符串是不可变序列,支持多种创建方式:
# Single and double quotes are interchangeable delimiters
single_quote = 'Hello, World!'
double_quote = "Hello, World!"
# Triple quotes let one literal span several lines (the newline is kept)
multi_line = """这是一个
多行字符串"""
# Raw string: backslashes are kept literally instead of starting escapes
raw_string = r"C:\Users\Documents\file.txt"
# Non-ASCII text and emoji are ordinary str content
chinese = "你好,世界!"
emoji = "🚀 Python 🐍"
print(f"单引号: {single_quote}")
# repr() shows the embedded newline as an escape sequence
print(f"多行: {repr(multi_line)}")
print(f"原始: {raw_string}")
print(f"中文: {chinese}")
print(f"表情: {emoji}")
字符串格式化:从旧到新
% 格式化(旧式,不推荐)
# Sample values reused by the formatting examples that follow
name = "张三"
age = 25
height = 1.75
# Basic %-formatting: %s for a string, %d for an integer, %.2f for two decimals
print("姓名: %s, 年龄: %d" % (name, age))
print("身高: %.2f 米" % height)
# Mapping form: %(key)s looks each value up in the dict on the right
data = {"name": "李四", "score": 95.5}
print("学生: %(name)s, 成绩: %(score).1f" % data)
# Alignment and padding (the sample words are just filler text)
print("|%10s|" % "center") # right-aligned in a field of width 10
print("|%-10s|" % "left") # left-aligned in a field of width 10
print("|%010d|" % 42) # zero-padded to width 10
str.format() 方法(过渡方案)
# Basic usage: empty braces are filled in argument order
print("姓名: {}, 年龄: {}".format(name, age))
print("身高: {:.2f} 米".format(height))
# Positional indices allow an argument to be reused
print("{0} 比 {1} 大,但 {1} 比 {0} 小".format(5, 3))
# Keyword arguments
print("姓名: {name}, 年龄: {age}".format(name="王五", age=30))
# Positional and keyword arguments mixed
print("{0} 和 {name} 是朋友".format("张三", name="李四"))
# Numeric format specs
total = 1234.5678
print("科学计数法: {:.2e}".format(total))
print("百分比: {:.2%}".format(0.85))
print("千位分隔符: {:,}".format(1234567))
# Alignment and padding in a field of width 10
print("|{:>10}|".format("right")) # right-aligned
print("|{:<10}|".format("left")) # left-aligned
print("|{:^10}|".format("center")) # centered
print("|{:*^10}|".format("center")) # centered, padded with *
f-string(推荐,Python 3.6+)
# Basic usage: names are interpolated directly (reuses name/age/height from above)
print(f"姓名: {name}, 年龄: {age}")
print(f"身高: {height:.2f} 米")
# Arbitrary expressions are evaluated inside the braces
radius = 5
print(f"圆的面积: {3.14159 * radius ** 2:.2f}")
# Function calls are allowed inside a replacement field
def get_score():
    """Return the fixed sample score used by the f-string example."""
    return 95.5


print(f"成绩: {get_score():.1f}分")
# Format-spec options
score = 85.6666
print(f"成绩: {score:.1f}")  # one decimal place
print(f"成绩: {score:.0f}")  # no decimals
print(f"百分比: {score/100:.2%}")  # percent format
# Alignment and width; a nested f-string builds the padded student label
for i in range(1, 6):
    print(f"第{i:>2}名: {f'学生{i}':<10} 成绩: {95-i*2}")
# Date/time formatting: strftime directives go after the colon
from datetime import datetime
now = datetime.now()
print(f"当前时间: {now:%Y-%m-%d %H:%M:%S}")
print(f"日期: {now:%Y年%m月%d日}")
# Debug output (Python 3.8+)
x = 10
y = 20
print(f"x = {x}, y = {y}, x+y = {x+y}")  # traditional spelling
print(f"{x=}, {y=}, {x+y=}")  # "=" prints the expression text and its value
字符串切片:精准提取
基本切片语法
text = "Python Programming"
# Basic slicing: [start:stop:step]
print(text[0:6])  # "Python", indices 0 through 5
print(text[7:])  # "Programming", index 7 to the end
print(text[:6])  # "Python", start through index 5
print(text[:])  # "Python Programming", a full copy
# Negative indices count from the end
print(text[-11:])  # "Programming", the last 11 characters
print(text[:-12])  # "Python", everything except the last 12 characters
# Step
print(text[::2])  # "Pto rgamn", every second character
print(text[::-1])  # "gnimmargorP nohtyP", the string reversed
高级切片技巧
# Extract the file extension: slice after the last dot, whatever its length.
# dot_index > 0 excludes names with no dot and dot-files such as ".bashrc".
filename = "document.pdf"
dot_index = filename.rfind('.')
extension = filename[dot_index + 1:] if dot_index > 0 else "其他"
print(f"文件扩展名: {extension}")
# Extract the domain part of an e-mail address
email = "user@example.com"
domain = email.split('@')[1] if '@' in email else ""
print(f"邮箱域名: {domain}")
# Reverse the letters of every word while keeping the word order
sentence = "Hello World Python"
reversed_words = ' '.join(word[::-1] for word in sentence.split())
print(f"反转单词: {reversed_words}")
# Extract numbers (digits with an optional decimal part)
import re
text_with_numbers = "价格: ¥199.99,库存: 50件"
numbers = re.findall(r'\d+\.?\d*', text_with_numbers)
print(f"提取的数字: {numbers}")
# Parse one CSV record; every field stays a string after split()
csv_line = "张三,25,北京,175.5"
parts = csv_line.split(',')
name, age, city, height = parts
print(f"姓名: {name}, 年龄: {age}, 城市: {city}, 身高: {height}")
切片的高级应用
# 1. String rotation
def rotate_string(s, n):
    """Rotate ``s`` to the right by ``n`` positions, wrapping around.

    Args:
        s: The string to rotate.
        n: Number of positions; values larger than ``len(s)`` wrap.

    Returns:
        The rotated string. An empty ``s`` is returned unchanged.
    """
    # Guard first: the modulo below would divide by zero for an empty string.
    if not s:
        return s
    n %= len(s)
    # With n == 0 this is s[0:] + s[:0], i.e. the original string.
    return s[-n:] + s[:-n]


text = "abcdef"
print(f"原始: {text}")
print(f"右移2位: {rotate_string(text, 2)}")  # "efabcd"
print(f"右移5位: {rotate_string(text, 5)}")  # "bcdefa"
# 2. Palindrome check
def is_palindrome(s):
    """Return True if ``s`` reads the same forwards and backwards.

    Case is ignored, and every character that is not alphanumeric
    (spaces, punctuation) is dropped before comparing.
    """
    cleaned = ''.join(c.lower() for c in s if c.isalnum())
    return cleaned == cleaned[::-1]


test_strings = ["racecar", "hello", "A man a plan a canal Panama"]
for s in test_strings:
    print(f"'{s}' 是回文: {is_palindrome(s)}")
# 3. Find every position of a substring
def find_all_occurrences(text, pattern):
    """Return the start index of every occurrence of ``pattern`` in ``text``.

    Overlapping occurrences are included, because the search resumes one
    character after each hit rather than after the end of the match.
    """
    positions = []
    pos = text.find(pattern)
    while pos != -1:
        positions.append(pos)
        pos = text.find(pattern, pos + 1)
    return positions


text = "ababaab"
pattern = "aba"
positions = find_all_occurrences(text, pattern)
print(f"'{pattern}' 在 '{text}' 中的位置: {positions}")
字符串方法大全
查找和替换
text = "Python is awesome, Python is powerful"
# Searching
print(text.find("Python")) # 0, index of the first occurrence
print(text.rfind("Python")) # 19, index of the last occurrence
print(text.find("Java")) # -1, not found
print(text.count("Python")) # 2, number of occurrences
print(text.index("is")) # 7, index of the first occurrence
print(text.rindex("is")) # 26, index of the last occurrence
# Replace every occurrence
new_text = text.replace("Python", "Java")
print(new_text)
# Replace only the first occurrence (count=1)
partial_replace = text.replace("Python", "Java", 1)
print(partial_replace)
大小写转换
# Case conversion
text = "Python Programming"
print(text.upper()) # "PYTHON PROGRAMMING"
print(text.lower()) # "python programming"
print(text.title()) # "Python Programming"
print(text.capitalize()) # "Python programming"
print(text.swapcase()) # "pYTHON pROGRAMMING"
# Case predicates
print(text.isupper()) # False
print(text.islower()) # False
print(text.istitle()) # True
# Practical use: username normalization
def normalize_username(username):
    """Return ``username`` trimmed, lower-cased, with spaces as underscores."""
    # Trim surrounding whitespace first so it is not turned into underscores.
    username = username.strip().lower()
    return username.replace(" ", "_")


usernames = [" John Doe ", "Jane Smith", "BOB_JOHNSON"]
normalized = [normalize_username(name) for name in usernames]
print("规范化后的用户名:", normalized)
分割和连接
# Split a string on a separator
text = "apple,banana,orange,grape"
fruits = text.split(",")
print(f"水果列表: {fruits}")
# Limit the number of splits (maxsplit=2 yields at most three pieces)
limited_split = text.split(",", 2)
print(f"限制分割2次: {limited_split}")
# Split from the right: a single split separates the last path component
path = "/home/user/documents/file.txt"
parts = path.rsplit("/", 1)
print(f"路径分割: {parts}")
# Split into lines
multiline = """第一行
第二行
第三行"""
lines = multiline.splitlines()
print(f"行列表: {lines}")
# Join a list of strings with a separator
words = ["Hello", "World", "Python"]
sentence = " ".join(words)
print(f"连接结果: {sentence}")
# Practical use: build a path with os.path.join instead of manual concatenation
import os
directory = "home"
subdir = "user"
filename = "document.pdf"
full_path = os.path.join(directory, subdir, filename)
print(f"完整路径: {full_path}")
去除空白字符
# Strip whitespace
text = " Python Programming "
print(f"原始: '{text}'")
print(f"去除首尾: '{text.strip()}'")
print(f"去除左侧: '{text.lstrip()}'")
print(f"去除右侧: '{text.rstrip()}'")
# Strip a specific character from both ends
custom_text = "***Important***"
print(f"去除*号: '{custom_text.strip('*')}'")
# Practical use: cleaning user input
def clean_user_input(user_input):
    """Return ``user_input`` trimmed, with inner whitespace runs collapsed.

    Tabs and newlines count as whitespace, so they also become a single
    space.
    """
    # split() with no argument already drops leading/trailing whitespace and
    # treats any whitespace run as one separator, so no strip() is needed.
    return ' '.join(user_input.split())


test_inputs = [
    " hello world ",
    "\tPython\nProgramming\n",
    " too much space ",
]
for inp in test_inputs:
    print(f"原始: '{inp}'")
    print(f"清理: '{clean_user_input(inp)}'")
    print("-" * 30)
判断字符串性质
# Character-class predicates applied to a set of sample strings
test_strings = ["123", "abc", "ABC", "123abc", "", " ", "Python"]
for s in test_strings:
    print(f"字符串: '{s}'")
    print(f" 是否只包含数字: {s.isdigit()}")
    print(f" 是否只包含字母: {s.isalpha()}")
    print(f" 是否只包含字母和数字: {s.isalnum()}")
    print(f" 是否只包含空白: {s.isspace()}")
    print(f" 是否为空: {s == ''}")
    print(f" 是否可打印: {s.isprintable()}")
    print("-" * 20)
# Practical use: input validation
def validate_phone_number(phone):
    """Check that ``phone`` has exactly 11 ASCII digits after cleanup.

    Spaces and hyphens are removed before checking.

    Returns:
        ``(True, cleaned_digits)`` when valid, otherwise ``(False, None)``.
    """
    cleaned = phone.replace(" ", "").replace("-", "")
    # isdigit() alone also accepts non-ASCII digits (full-width, superscript);
    # isascii() restricts the match to the characters 0-9.
    if len(cleaned) == 11 and cleaned.isascii() and cleaned.isdigit():
        return True, cleaned
    return False, None


phone_numbers = ["138-1234-5678", "13812345678", "12345", "abc123"]
for phone in phone_numbers:
    valid, cleaned = validate_phone_number(phone)
    print(f"手机号 '{phone}' -> 有效: {valid}, 清理后: {cleaned}")
字符串编码与 Unicode
编码基础
# Encoding a string
text = "你好,世界!"
# Encode the same text to bytes with two different codecs
utf8_bytes = text.encode('utf-8')
gbk_bytes = text.encode('gbk')
print(f"原始字符串: {text}")
print(f"UTF-8 编码: {utf8_bytes}")
print(f"GBK 编码: {gbk_bytes}")
# Decode the bytes back, each with the codec that produced them
decoded_utf8 = utf8_bytes.decode('utf-8')
decoded_gbk = gbk_bytes.decode('gbk')
print(f"UTF-8 解码: {decoded_utf8}")
print(f"GBK 解码: {decoded_gbk}")
Unicode 处理
# Unicode characters
unicode_text = "Hello 世界 🌍"
# Show the code point of every character
for char in unicode_text:
    print(f"字符: {char}, Unicode: U+{ord(char):04X}")
# Build characters from code points
char1 = chr(65)  # 'A'
char2 = chr(20320)  # '你'
char3 = chr(0x1F30D)  # '🌍'
print(f"字符: {char1}, {char2}, {char3}")
# Unicode normalization
import unicodedata
# Both strings render as "café" but use different code point sequences.
# Escapes keep the example correct even if an editor normalizes this file.
text1 = "caf\u00e9"  # precomposed "é" (one code point)
text2 = "cafe\u0301"  # base "e" followed by a combining acute accent
print(f"text1: {text1}, text2: {text2}")
print(f"相等: {text1 == text2}")  # False
# NFC composes both to the same sequence
normalized1 = unicodedata.normalize('NFC', text1)
normalized2 = unicodedata.normalize('NFC', text2)
print(f"规范化后相等: {normalized1 == normalized2}")  # True
常见陷阱和解决方案
字符串不可变性
# ⚠️ Wrong: trying to modify a string in place
text = "Hello"
# text[0] = "h" # TypeError: 'str' object does not support item assignment
# ✅ Right: build a new string from the pieces
text = "Hello"
new_text = "h" + text[1:]
print(f"新字符串: {new_text}")
# 大量字符串拼接的性能问题
import time
# Inefficient approach, kept as the baseline for the timing comparison
def slow_concat(n):
    """Concatenate the decimal numbers 0..n-1 using repeated ``+=``.

    Each ``+=`` may build a new intermediate string, which is the cost this
    example is meant to show.
    """
    result = ""
    for i in range(n):
        result += str(i)
    return result
# Efficient approach: collect the pieces in a list, join once
def fast_concat(n):
    """Concatenate the decimal numbers 0..n-1 with a single ``str.join``."""
    parts = []
    for i in range(n):
        parts.append(str(i))
    return "".join(parts)
# Time both approaches (n is kept small so the example runs instantly)
n = 1000
# perf_counter() is a high-resolution timer meant for measuring short
# durations; time.time() can report 0.0 for a run this brief.
start = time.perf_counter()
slow_result = slow_concat(n)
slow_time = time.perf_counter() - start
start = time.perf_counter()
fast_result = fast_concat(n)
fast_time = time.perf_counter() - start
print(f"慢速方法耗时: {slow_time:.4f}秒")
print(f"快速方法耗时: {fast_time:.4f}秒")
# Guard the ratio: a zero elapsed time would raise ZeroDivisionError.
if fast_time > 0:
    print(f"加速比: {slow_time/fast_time:.2f}倍")
else:
    print("加速比: 无法计算(耗时过短)")
编码错误处理
# ⚠️ Encoding error: ASCII cannot represent Chinese characters
try:
    text = "你好"
    encoded = text.encode('ascii')
except UnicodeEncodeError as e:
    print(f"编码错误: {e}")
# ✅ Error-handling strategies
text = "你好,世界!"
# Strategy 1: drop the characters that cannot be encoded
encoded_ignore = text.encode('ascii', errors='ignore')
print(f"忽略错误: {encoded_ignore}")
# Strategy 2: replace them with "?"
encoded_replace = text.encode('ascii', errors='replace')
print(f"替换错误: {encoded_replace}")
# Strategy 3: replace them with XML character references
encoded_xml = text.encode('ascii', errors='xmlcharrefreplace')
print(f"XML 实体: {encoded_xml}")
# Decoding errors: these two bytes are not valid UTF-8
bytes_data = b'\xff\xfe'
try:
    decoded = bytes_data.decode('utf-8')
except UnicodeDecodeError as e:
    print(f"解码错误: {e}")
# errors='replace' substitutes U+FFFD for every undecodable byte
decoded = bytes_data.decode('utf-8', errors='replace')
print(f"替换后: {decoded}")
字符串比较陷阱
# ⚠️ Pitfall: comparisons are case-sensitive
usernames = ["Alice", "alice", "ALICE"]
target = "alice"
# Exact comparison misses the other spellings
matches = [name for name in usernames if name == target]
print(f"精确匹配: {matches}") # only "alice" matches
# ✅ Case-insensitive comparison: casefold both sides
matches_casefold = [name for name in usernames if name.casefold() == target.casefold()]
print(f"不区分大小写: {matches_casefold}") # matches every variant
# casefold() is more aggressive than lower(): it also expands "ß" to "ss"
text = "Straße" # contains the German sharp s
print(f"lower(): {text.lower()}")
print(f"casefold(): {text.casefold()}")
# Normalize before comparing
def normalize_string(s):
    """Return ``s`` trimmed and case-folded, for case-insensitive comparison."""
    return s.strip().casefold()


str1 = " Hello World "
str2 = "hello world"
print(f"规范化前相等: {str1 == str2}")
print(f"规范化后相等: {normalize_string(str1) == normalize_string(str2)}")
正则表达式陷阱
import re
# ⚠️ Pitfall: greedy matching
text = "<div>内容1</div><div>内容2</div>"
pattern_greedy = r"<div>.*</div>"
matches_greedy = re.findall(pattern_greedy, text)
print(f"贪婪匹配: {matches_greedy}")  # one match spanning the whole string
# ✅ Non-greedy quantifier
pattern_non_greedy = r"<div>.*?</div>"
matches_non_greedy = re.findall(pattern_non_greedy, text)
print(f"非贪婪匹配: {matches_non_greedy}")  # each div matched separately
# ⚠️ Pitfall: unescaped metacharacters
special_text = "价格: $100.50 (含税)"
pattern_wrong = "$100.50"  # "$" anchors at the end, "." matches any character
# This pattern is valid, so re.error is never raised. It silently matches
# nothing because "$" cannot be followed by more text. Print the empty
# result so the pitfall is visible.
matches_wrong = re.findall(pattern_wrong, special_text)
print(f"未转义匹配: {matches_wrong}")
# ✅ Escape the metacharacters
pattern_correct = r"\$100\.50"
matches_correct = re.findall(pattern_correct, special_text)
print(f"正确匹配: {matches_correct}")
# re.escape() escapes them automatically
escaped_pattern = re.escape("$100.50")
print(f"自动转义: {escaped_pattern}")
实战项目:文本处理工具
日志分析器
class LogAnalyzer:
    """Simple analyzer for a plain-text log held in memory.

    A line counts as an error line when it contains "ERROR" in any letter
    case.
    """

    def __init__(self, log_content):
        """Store the raw log text and split it into lines once."""
        self.log_content = log_content
        self.log_lines = log_content.splitlines()

    def count_errors(self):
        """Return the number of error lines."""
        return sum(1 for line in self.log_lines if "ERROR" in line.upper())

    def find_errors(self):
        """Return ``(line_number, stripped_line)`` for every error line.

        Line numbers start at 1.
        """
        return [
            (number, line.strip())
            for number, line in enumerate(self.log_lines, 1)
            if "ERROR" in line.upper()
        ]

    def extract_timestamps(self):
        """Return every ``YYYY-MM-DD HH:MM:SS`` timestamp, in log order."""
        import re
        timestamp_pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
        return re.findall(timestamp_pattern, self.log_content)

    def generate_summary(self):
        """Return a text summary: line count, errors, time range, error rate."""
        total_lines = len(self.log_lines)
        error_count = self.count_errors()
        timestamps = self.extract_timestamps()
        first_seen = timestamps[0] if timestamps else '无'
        last_seen = timestamps[-1] if timestamps else '无'
        # An empty log has no lines to divide by; report 0% rather than
        # raising ZeroDivisionError.
        error_rate = (error_count / total_lines) * 100 if total_lines else 0.0
        return (
            "\n"
            "日志摘要:\n"
            f"总条数: {total_lines}\n"
            f"错误数: {error_count}\n"
            f"时间范围: {first_seen} - {last_seen}\n"
            f"错误率: {error_rate:.1f}%\n"
        )
# Try the log analyzer on a small sample log
sample_log = """2024-01-01 10:00:01 INFO 系统启动
2024-01-01 10:00:02 DEBUG 加载配置文件
2024-01-01 10:00:03 ERROR 数据库连接失败
2024-01-01 10:00:04 INFO 尝试重新连接
2024-01-01 10:00:05 ERROR 连接超时
2024-01-01 10:00:06 WARNING 内存使用率过高
2024-01-01 10:00:07 INFO 系统正常运行"""
analyzer = LogAnalyzer(sample_log)
print(analyzer.generate_summary())
print("错误详情:")
for line_num, error in analyzer.find_errors():
    print(f"第{line_num}行: {error}")
文本统计工具
class TextStatistics:
    """Basic statistics over a text: words, characters, sentences.

    Words are whitespace-separated tokens of the punctuation-free text, so
    text written without spaces forms one token per unbroken run.
    """

    def __init__(self, text):
        """Keep the raw text and a punctuation-free copy for counting."""
        self.text = text
        self.clean_text = self._clean_text()

    def _clean_text(self):
        """Return the text with everything but word chars and whitespace removed."""
        import re
        # Keeps letters, digits, underscore, whitespace and CJK ideographs.
        return re.sub(r'[^\w\s\u4e00-\u9fff]', '', self.text)

    def word_count(self):
        """Return the number of whitespace-separated tokens."""
        return len(self.clean_text.split())

    def character_count(self):
        """Return the number of non-whitespace characters in the clean text.

        Every kind of whitespace is excluded, so newlines and tabs in
        multi-line input are not counted.
        """
        return sum(1 for ch in self.clean_text if not ch.isspace())

    def sentence_count(self):
        """Return the number of non-empty sentences.

        Sentences end at ASCII ``. ! ?`` or at the CJK marks ``。 ! ?``.
        """
        import re
        fragments = re.split(r'[.!?。!?]', self.text)
        # Consecutive terminators and trailing text leave empty fragments.
        return sum(1 for fragment in fragments if fragment.strip())

    def average_word_length(self):
        """Return the mean token length, or 0 when there are no tokens."""
        words = self.clean_text.split()
        if not words:
            return 0
        return sum(len(word) for word in words) / len(words)

    def most_common_words(self, n=5):
        """Return the ``n`` most frequent tokens as ``(word, count)`` pairs.

        Tokens are lower-cased; tokens of one or two characters are skipped.
        """
        from collections import Counter
        counter = Counter(
            word for word in self.clean_text.lower().split() if len(word) > 2
        )
        return counter.most_common(n)

    def readability_score(self):
        """Return a rough readability score (lower means easier to read).

        The score is the average sentence length in words plus ten times
        the average word length, rounded to two decimals. It is 0 when the
        text has no words or no sentences.
        """
        words = self.word_count()
        sentences = self.sentence_count()
        if sentences == 0 or words == 0:
            return 0
        avg_sentence_length = words / sentences
        avg_word_length = self.character_count() / words
        return round(avg_sentence_length + avg_word_length * 10, 2)

    def generate_report(self):
        """Return the full multi-line statistics report as one string."""
        header = (
            "\n"
            "文本统计报告:\n"
            "================\n"
            f"总字符数: {len(self.text)}\n"
            f"有效字符数: {self.character_count()}\n"
            f"词数: {self.word_count()}\n"
            f"句子数: {self.sentence_count()}\n"
            f"平均词长: {self.average_word_length():.2f}\n"
            f"可读性评分: {self.readability_score()}\n"
            "最常见的词:\n"
        )
        # Join once instead of growing the report with += in a loop.
        word_lines = [
            f" {word}: {count}次\n" for word, count in self.most_common_words()
        ]
        return header + "".join(word_lines)
# Try the text statistics tool on a short multi-line sample
test_text = """
Python 是一种简洁而强大的编程语言。它具有简单易学的语法,
同时提供了丰富的标准库和第三方库。Python 广泛应用于数据科学、
Web开发、人工智能等领域。它的设计哲学强调代码的可读性和简洁性。
"""
stats = TextStatistics(test_text)
print(stats.generate_report())
总结
本文深入探讨了 Python 字符串的各种实战技巧:
- 字符串格式化:从 % 格式化到 f-string,选择合适的方法让代码更简洁
- 字符串切片:掌握切片语法,能够高效提取和处理字符串片段
- 字符串方法:熟练使用各种内置方法,解决常见的文本处理问题
- 编码处理:理解 Unicode 和编码,避免常见的编码陷阱
- 正则表达式:合理使用正则表达式处理复杂的文本模式
- 性能优化:注意字符串的不可变性,避免低效的字符串拼接
字符串处理是编程中的基础技能,掌握这些技巧将帮助你编写更高效、更可靠的文本处理代码。记住:
- 优先使用 f-string 进行字符串格式化
- 使用切片而不是循环来处理字符串片段
- 注意字符串的不可变性,大量拼接时使用 join()
- 处理用户输入时要进行适当的清理和验证
- 编码问题要尽早处理,避免在后期出现难以调试的错误
以上就是Python中字符串格式化、切片与常见陷阱详解的详细内容,更多关于Python字符串的资料请关注脚本之家其它相关文章!
