python

关注公众号 jb51net

关闭
首页 > 脚本专栏 > python > Python字符串

Python中字符串格式化、切片与常见陷阱详解

作者:小庄-Python办公

字符串是编程中最常用的数据类型之一,Python 的字符串功能强大且灵活,本文将深入探讨 Python 字符串的各种实战技巧,包括格式化方法、切片操作以及常见的陷阱和解决方案。

字符串基础回顾

Python 中的字符串是不可变序列,支持多种创建方式:

# Single and double quotes are fully equivalent
single_quote = 'Hello, World!'
double_quote = "Hello, World!"

# Triple quotes let one literal span several lines
multi_line = """这是一个
多行字符串"""

# Raw string: backslashes are kept literally instead of starting escapes
raw_string = r"C:\Users\Documents\file.txt"

# Non-ASCII text (CJK, emoji) is written directly in ordinary literals
chinese = "你好,世界!"
emoji = "🚀 Python 🐍"

print(f"单引号: {single_quote}")
# repr() shows the embedded line break as \n instead of printing it
print(f"多行: {repr(multi_line)}")
print(f"原始: {raw_string}")
print(f"中文: {chinese}")
print(f"表情: {emoji}")

字符串格式化:从旧到新

% 格式化(旧式,不推荐)

# Sample values reused by the formatting examples that follow
name = "张三"
age = 25
height = 1.75

# Basic formatting: %s inserts a string, %d an integer, %.2f a float with two decimals
print("姓名: %s, 年龄: %d" % (name, age))
print("身高: %.2f 米" % height)

# With a dict on the right-hand side, placeholders refer to keys by name
data = {"name": "李四", "score": 95.5}
print("学生: %(name)s, 成绩: %(score).1f" % data)

# Alignment and padding
print("|%10s|" % "center")   # right-aligned, width 10 (despite the sample text "center")
print("|%-10s|" % "left")    # left-aligned, width 10
print("|%010d|" % 42)        # zero-padded, width 10

str.format() 方法(过渡方案)

# Basic usage: empty braces are filled in order.
# name, age and height are the variables defined in the %-formatting example.
print("姓名: {}, 年龄: {}".format(name, age))
print("身高: {:.2f} 米".format(height))

# Positional arguments can be referenced by index and reused
print("{0} 比 {1} 大,但 {1} 比 {0} 小".format(5, 3))

# Keyword arguments are referenced by name
print("姓名: {name}, 年龄: {age}".format(name="王五", age=30))

# Positional and keyword arguments can be mixed
print("{0} 和 {name} 是朋友".format("张三", name="李四"))

# Number presentation types
total = 1234.5678
print("科学计数法: {:.2e}".format(total))
print("百分比: {:.2%}".format(0.85))
print("千位分隔符: {:,}".format(1234567))

# Alignment and padding within a width of 10
print("|{:>10}|".format("right"))    # right-aligned
print("|{:<10}|".format("left"))     # left-aligned
print("|{:^10}|".format("center"))  # centered
print("|{:*^10}|".format("center"))  # centered, padded with *

f-string(推荐,Python 3.6+)

# Basic usage (the most concise form).
# name, age and height are the variables defined in the %-formatting example.
print(f"姓名: {name}, 年龄: {age}")
print(f"身高: {height:.2f} 米")

# Arbitrary expressions are evaluated inside the braces
radius = 5
print(f"圆的面积: {3.14159 * radius ** 2:.2f}")

# Function calls work as well
def get_score():
    """Return a fixed sample score for the example below."""
    return 95.5

print(f"成绩: {get_score():.1f}分")

# Format specifications
score = 85.6666
print(f"成绩: {score:.1f}")          # one decimal place
print(f"成绩: {score:.0f}")          # rounded to no decimals
print(f"百分比: {score/100:.2%}")    # percentage format

# Alignment and width; the inner f-string is built first, then padded to 10
for i in range(1, 6):
    print(f"第{i:>2}名: {f'学生{i}':<10} 成绩: {95-i*2}")

# Date/time formatting: the format spec is passed on as strftime directives
from datetime import datetime
now = datetime.now()
print(f"当前时间: {now:%Y-%m-%d %H:%M:%S}")
print(f"日期: {now:%Y年%m月%d日}")

# Debug output (Python 3.8+)
x = 10
y = 20
print(f"x = {x}, y = {y}, x+y = {x+y}")  # the traditional way
print(f"{x=}, {y=}, {x+y=}")             # '=' prints the expression text and its value

字符串切片:精准提取

基本切片语法

# Sample string for the slicing examples
text = "Python Programming"

# Basic slicing: [start:stop:step]; stop is exclusive
print(text[0:6])     # "Python": indices 0 through 5
print(text[7:])      # "Programming": index 7 to the end
print(text[:6])      # "Python": start through index 5
print(text[:])       # "Python Programming": the whole string

# Negative indices count from the end
print(text[-11:])    # "Programming": the last 11 characters
print(text[:-12])    # "Python": everything except the last 12 characters

# Step
print(text[::2])     # "Pto rgamn": every second character
print(text[::-1])    # "gnimmargorP nohtyP": the string reversed

高级切片技巧

# Extract the file extension by slicing after the last dot, so any extension
# works and a name without a dot falls back to the placeholder
filename = "document.pdf"
dot_index = filename.rfind('.')
extension = filename[dot_index + 1:] if dot_index != -1 else "其他"
print(f"文件扩展名: {extension}")

# Extract the e-mail domain: the part after '@', or "" when there is none
email = "user@example.com"
domain = email.split('@')[1] if '@' in email else ""
print(f"邮箱域名: {domain}")

# Reverse the letters of each word while keeping the word order
sentence = "Hello World Python"
reversed_words = ' '.join(word[::-1] for word in sentence.split())
print(f"反转单词: {reversed_words}")

# Extract numbers: digits, an optional dot, then optional further digits
import re
text_with_numbers = "价格: ¥199.99,库存: 50件"
numbers = re.findall(r'\d+\.?\d*', text_with_numbers)
print(f"提取的数字: {numbers}")

# Split a CSV line into fields and unpack them.
# Note: this rebinds name, age and height to strings.
csv_line = "张三,25,北京,175.5"
parts = csv_line.split(',')
name, age, city, height = parts
print(f"姓名: {name}, 年龄: {age}, 城市: {city}, 身高: {height}")

切片的高级应用

# 1. 字符串旋转
def rotate_string(s, n):
    """Return *s* rotated cyclically to the right by *n* positions.

    *n* may exceed ``len(s)`` or be negative (a negative value rotates
    left); it is reduced modulo the length. An empty string is returned
    unchanged instead of raising ZeroDivisionError.
    """
    if not s:
        return s
    shift = n % len(s)
    if shift == 0:
        # s[-0:] would be the whole string, so handle the no-op explicitly.
        return s
    return s[-shift:] + s[:-shift]

# Try rotate_string on a six-character sample; expected results are shown inline
text = "abcdef"
print(f"原始: {text}")
print(f"右移2位: {rotate_string(text, 2)}")  # "efabcd"
print(f"右移5位: {rotate_string(text, 5)}")  # "bcdefa"

# 2. 回文检测
def is_palindrome(s):
    """Return True if *s* is a palindrome.

    Case is ignored and only alphanumeric characters take part in the
    comparison.
    """
    kept = []
    for ch in s:
        if ch.isalnum():
            kept.append(ch.lower())
    forward = ''.join(kept)
    backward = ''.join(reversed(forward))
    return forward == backward

# Check a plain palindrome, a non-palindrome, and a phrase with spaces and mixed case
test_strings = ["racecar", "hello", "A man a plan a canal Panama"]
for s in test_strings:
    print(f"'{s}' 是回文: {is_palindrome(s)}")

# 3. 提取子串的所有位置
def find_all_occurrences(text, pattern):
    """Return the start index of every occurrence of *pattern* in *text*.

    Overlapping occurrences are included, because each search resumes one
    character after the previous hit.
    """
    hits = []
    idx = text.find(pattern)
    while idx != -1:
        hits.append(idx)
        idx = text.find(pattern, idx + 1)
    return hits

# "aba" occurs at indices 0 and 2 here; the two matches overlap
text = "ababaab"
pattern = "aba"
positions = find_all_occurrences(text, pattern)
print(f"'{pattern}' 在 '{text}' 中的位置: {positions}")

字符串方法大全

查找和替换

text = "Python is awesome, Python is powerful"

# Searching: find/rfind return -1 when the substring is absent
print(text.find("Python"))      # 0, position of the first occurrence
print(text.rfind("Python"))     # 19, position of the last occurrence
print(text.find("Java"))        # -1, not found

print(text.count("Python"))     # 2, number of occurrences
print(text.index("is"))         # 7, position of the first occurrence
print(text.rindex("is"))        # 26, position of the last occurrence

# Replace every occurrence
new_text = text.replace("Python", "Java")
print(new_text)

# Replace only the first occurrence (count=1)
partial_replace = text.replace("Python", "Java", 1)
print(partial_replace)

大小写转换

# Case conversion
text = "Python Programming"

print(text.upper())          # "PYTHON PROGRAMMING"
print(text.lower())          # "python programming"
print(text.title())          # "Python Programming"
print(text.capitalize())     # "Python programming"
print(text.swapcase())       # "pYTHON pROGRAMMING"

# Case predicates
print(text.isupper())        # False
print(text.islower())        # False
print(text.istitle())        # True

# 实际应用:用户名规范化
def normalize_username(username):
    """Return *username* trimmed, lower-cased, with spaces turned into underscores."""
    # Trim first so that leading/trailing spaces do not become underscores.
    return username.strip().lower().replace(" ", "_")

# Samples with surrounding spaces, mixed case, and an existing underscore
usernames = ["  John Doe  ", "Jane Smith", "BOB_JOHNSON"]
normalized = [normalize_username(name) for name in usernames]
print("规范化后的用户名:", normalized)

分割和连接

# Split on a separator
text = "apple,banana,orange,grape"
fruits = text.split(",")
print(f"水果列表: {fruits}")

# Limit the number of splits; the remainder stays in the last element
limited_split = text.split(",", 2)
print(f"限制分割2次: {limited_split}")

# Split from the right: one split separates the directory from the file name
path = "/home/user/documents/file.txt"
parts = path.rsplit("/", 1)
print(f"路径分割: {parts}")

# Split into lines
multiline = """第一行
第二行
第三行"""
lines = multiline.splitlines()
print(f"行列表: {lines}")

# Join a list of strings with a separator
words = ["Hello", "World", "Python"]
sentence = " ".join(words)
print(f"连接结果: {sentence}")

# Practical use: build a path with os.path.join
import os
directory = "home"
subdir = "user"
filename = "document.pdf"
full_path = os.path.join(directory, subdir, filename)
print(f"完整路径: {full_path}")

去除空白字符

# Stripping whitespace
text = "   Python Programming   "

# The quotes in the output make the remaining spaces visible
print(f"原始: '{text}'")
print(f"去除首尾: '{text.strip()}'")
print(f"去除左侧: '{text.lstrip()}'")
print(f"去除右侧: '{text.rstrip()}'")

# Stripping specific characters from both ends
custom_text = "***Important***"
print(f"去除*号: '{custom_text.strip('*')}'")

# 实际应用:清理用户输入
def clean_user_input(user_input):
    """Return *user_input* with outer whitespace removed and inner runs collapsed.

    Any run of whitespace (spaces, tabs, newlines) between words becomes a
    single space.
    """
    tokens = user_input.strip().split()
    return ' '.join(tokens)

# Inputs with repeated spaces, tabs/newlines, and heavy padding
test_inputs = [
    "  hello   world  ",
    "\tPython\nProgramming\n",
    "   too   much   space   "
]

# Print each input before and after cleaning, separated by a rule
for inp in test_inputs:
    print(f"原始: '{inp}'")
    print(f"清理: '{clean_user_input(inp)}'")
    print("-" * 30)

判断字符串性质

# Inspect string properties; the samples include an empty and a blank string
test_strings = ["123", "abc", "ABC", "123abc", "", " ", "Python"]

for s in test_strings:
    print(f"字符串: '{s}'")
    print(f"  是否只包含数字: {s.isdigit()}")
    print(f"  是否只包含字母: {s.isalpha()}")
    print(f"  是否只包含字母和数字: {s.isalnum()}")
    print(f"  是否只包含空白: {s.isspace()}")
    print(f"  是否为空: {s == ''}")
    print(f"  是否可打印: {s.isprintable()}")
    print("-" * 20)

# 实际应用:输入验证
def validate_phone_number(phone):
    """Validate an 11-digit mobile number.

    Spaces and hyphens are removed before checking.

    Returns:
        ``(True, digits)`` when exactly 11 ASCII digits remain, otherwise
        ``(False, None)``.
    """
    cleaned = phone.replace(" ", "").replace("-", "")

    # str.isdigit() alone also accepts non-ASCII digits (full-width digits,
    # superscripts, ...), which are not dialable, so require ASCII as well.
    if len(cleaned) == 11 and cleaned.isascii() and cleaned.isdigit():
        return True, cleaned
    return False, None

# Samples: hyphenated, plain, too short, and containing letters
phone_numbers = ["138-1234-5678", "13812345678", "12345", "abc123"]
for phone in phone_numbers:
    valid, cleaned = validate_phone_number(phone)
    print(f"手机号 '{phone}' -> 有效: {valid}, 清理后: {cleaned}")

字符串编码与 Unicode

编码基础

# String encoding
text = "你好,世界!"

# Encode the same text to bytes with two different codecs
utf8_bytes = text.encode('utf-8')
gbk_bytes = text.encode('gbk')

print(f"原始字符串: {text}")
print(f"UTF-8 编码: {utf8_bytes}")
print(f"GBK 编码: {gbk_bytes}")

# Decode each byte string with the codec that produced it
decoded_utf8 = utf8_bytes.decode('utf-8')
decoded_gbk = gbk_bytes.decode('gbk')

print(f"UTF-8 解码: {decoded_utf8}")
print(f"GBK 解码: {decoded_gbk}")

Unicode 处理

# Unicode characters
unicode_text = "Hello 世界 🌍"

# Show each character's code point as at least four hex digits
for char in unicode_text:
    print(f"字符: {char}, Unicode: U+{ord(char):04X}")

# Create characters from code points
char1 = chr(65)      # 'A'
char2 = chr(20320)  # '你'
char3 = chr(0x1F30D)  # '🌍'

print(f"字符: {char1}, {char2}, {char3}")

# Unicode normalization
import unicodedata

# Two spellings that render the same
text1 = "café"      # presumably the single precomposed é (U+00E9); the False below relies on it
text2 = "cafe\u0301"  # base letter 'e' followed by a combining acute accent

print(f"text1: {text1}, text2: {text2}")
print(f"相等: {text1 == text2}")  # False

# NFC composes base + combining mark, so both forms become identical
normalized1 = unicodedata.normalize('NFC', text1)
normalized2 = unicodedata.normalize('NFC', text2)
print(f"规范化后相等: {normalized1 == normalized2}")  # True

常见陷阱和解决方案

字符串不可变性

# ⚠️ Wrong: trying to modify a string in place
text = "Hello"
# text[0] = "h"  # TypeError: 'str' object does not support item assignment

# ✅ Right: build a new string from the replacement and a slice of the old one
text = "Hello"
new_text = "h" + text[1:]
print(f"新字符串: {new_text}")

# 大量字符串拼接的性能问题
import time

# 低效方法(大量创建中间字符串)
def slow_concat(n):
    """Concatenate the decimal forms of 0..n-1 by repeated ``+=``.

    This is deliberately the inefficient approach, kept for comparison
    with the join-based version.
    """
    text = ""
    for piece in map(str, range(n)):
        text += piece
    return text

# 高效方法(使用列表)
def fast_concat(n):
    """Concatenate the decimal forms of 0..n-1 with a single ``str.join``."""
    return "".join(str(number) for number in range(n))

# Time both approaches (n is kept small so the demo finishes instantly).
# perf_counter() is a high-resolution monotonic clock; time.time() can be too
# coarse to register such short runs.
n = 1000
start = time.perf_counter()
slow_result = slow_concat(n)
slow_time = time.perf_counter() - start

start = time.perf_counter()
fast_result = fast_concat(n)
fast_time = time.perf_counter() - start

print(f"慢速方法耗时: {slow_time:.4f}秒")
print(f"快速方法耗时: {fast_time:.4f}秒")
# Guard the ratio: a measured duration of zero would raise ZeroDivisionError.
if fast_time > 0:
    print(f"加速比: {slow_time/fast_time:.2f}倍")
else:
    print("加速比: 无法计算(耗时低于计时器精度)")

编码错误处理

# ⚠️ Encoding error: ASCII cannot represent Chinese characters
try:
    text = "你好"
    encoded = text.encode('ascii')
except UnicodeEncodeError as e:
    print(f"编码错误: {e}")

# ✅ Error-handling strategies
text = "你好,世界!"

# Strategy 1: silently drop characters that cannot be encoded
encoded_ignore = text.encode('ascii', errors='ignore')
print(f"忽略错误: {encoded_ignore}")

# Strategy 2: replace each of them with ?
encoded_replace = text.encode('ascii', errors='replace')
print(f"替换错误: {encoded_replace}")

# Strategy 3: replace each of them with an XML character reference
encoded_xml = text.encode('ascii', errors='xmlcharrefreplace')
print(f"XML 实体: {encoded_xml}")

# Handling decode errors: these two bytes are not valid UTF-8
bytes_data = b'\xff\xfe'
try:
    decoded = bytes_data.decode('utf-8')
except UnicodeDecodeError as e:
    print(f"解码错误: {e}")
    # Fall back to a lenient decode that substitutes a replacement character
    decoded = bytes_data.decode('utf-8', errors='replace')
    print(f"替换后: {decoded}")

字符串比较陷阱

# ⚠️ Case-sensitivity pitfall
usernames = ["Alice", "alice", "ALICE"]
target = "alice"

# Exact comparison misses the differently-cased variants
matches = [name for name in usernames if name == target]
print(f"精确匹配: {matches}")  # only "alice" matches

# ✅ Case-insensitive comparison: casefold both sides
matches_casefold = [name for name in usernames if name.casefold() == target.casefold()]
print(f"不区分大小写: {matches_casefold}")  # matches every variant

# Why casefold() rather than lower(): it is the more aggressive of the two
text = "Straße"  # contains the German sharp s, equivalent to "ss"
print(f"lower(): {text.lower()}")
print(f"casefold(): {text.casefold()}")

# 比较前的规范化
def normalize_string(s):
    """Return *s* prepared for comparison: outer whitespace removed, then case-folded."""
    trimmed = s.strip()
    return trimmed.casefold()

# The two strings differ only in padding and case
str1 = "  Hello World  "
str2 = "hello world"
print(f"规范化前相等: {str1 == str2}")
print(f"规范化后相等: {normalize_string(str1) == normalize_string(str2)}")

正则表达式陷阱

import re

# ⚠️ Greedy matching: .* runs to the LAST </div>
text = "<div>内容1</div><div>内容2</div>"
pattern_greedy = r"<div>.*</div>"
matches_greedy = re.findall(pattern_greedy, text)
print(f"贪婪匹配: {matches_greedy}")  # one match spanning the whole string

# ✅ Non-greedy matching: .*? stops at the first </div>
pattern_non_greedy = r"<div>.*?</div>"
matches_non_greedy = re.findall(pattern_non_greedy, text)
print(f"非贪婪匹配: {matches_non_greedy}")  # each div matched separately

# ⚠️ Unescaped special characters: "$" anchors to the end of the string and
# "." matches any character. The pattern is still VALID, so no re.error is
# raised; it simply never matches, which makes the mistake easy to miss.
special_text = "价格: $100.50 (含税)"
pattern_wrong = "$100.50"
matches_wrong = re.findall(pattern_wrong, special_text)
print(f"未转义的匹配结果: {matches_wrong}")  # [] (silently finds nothing)

# ✅ Escape the special characters by hand
pattern_correct = r"\$100\.50"
matches_correct = re.findall(pattern_correct, special_text)
print(f"正确匹配: {matches_correct}")

# Or let re.escape() do the escaping
escaped_pattern = re.escape("$100.50")
print(f"自动转义: {escaped_pattern}")

实战项目:文本处理工具

日志分析器

class LogAnalyzer:
    """Minimal analyzer for a plain-text log held in memory.

    The text is split into lines once at construction; every query works on
    that list or on the raw text.
    """

    def __init__(self, log_content):
        """Store the raw log text and its individual lines."""
        self.log_content = log_content
        self.log_lines = log_content.splitlines()

    def count_errors(self):
        """Return the number of lines containing "ERROR" in any letter case."""
        return sum(1 for line in self.log_lines if "ERROR" in line.upper())

    def find_errors(self):
        """Return ``(line_number, stripped_line)`` for every error line.

        Line numbers are 1-based.
        """
        return [
            (number, line.strip())
            for number, line in enumerate(self.log_lines, 1)
            if "ERROR" in line.upper()
        ]

    def extract_timestamps(self):
        """Return all ``YYYY-MM-DD HH:MM:SS`` timestamps in order of appearance."""
        import re
        timestamp_pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
        return re.findall(timestamp_pattern, self.log_content)

    def generate_summary(self):
        """Return a multi-line summary: line count, error count, time range, error rate.

        An empty log yields an error rate of 0.0% instead of raising
        ZeroDivisionError.
        """
        total_lines = len(self.log_lines)
        error_count = self.count_errors()
        timestamps = self.extract_timestamps()

        # No lines means nothing to divide by; report a zero rate.
        error_rate = (error_count / total_lines) * 100 if total_lines else 0.0
        first_seen = timestamps[0] if timestamps else '无'
        last_seen = timestamps[-1] if timestamps else '无'

        return (
            "\n"
            "日志摘要:\n"
            f"总条数: {total_lines}\n"
            f"错误数: {error_count}\n"
            f"时间范围: {first_seen} - {last_seen}\n"
            f"错误率: {error_rate:.1f}%\n"
        )

# Try the analyzer on a seven-line sample containing two ERROR entries
sample_log = """2024-01-01 10:00:01 INFO 系统启动
2024-01-01 10:00:02 DEBUG 加载配置文件
2024-01-01 10:00:03 ERROR 数据库连接失败
2024-01-01 10:00:04 INFO 尝试重新连接
2024-01-01 10:00:05 ERROR 连接超时
2024-01-01 10:00:06 WARNING 内存使用率过高
2024-01-01 10:00:07 INFO 系统正常运行"""

analyzer = LogAnalyzer(sample_log)
print(analyzer.generate_summary())
# List each error with its 1-based line number
print("错误详情:")
for line_num, error in analyzer.find_errors():
    print(f"第{line_num}行: {error}")

文本统计工具

class TextStatistics:
    """Simple statistics (words, characters, sentences) for a piece of text.

    Words are whitespace-separated tokens of the punctuation-stripped text,
    so unsegmented CJK text counts each unbroken run as one word.
    """

    def __init__(self, text):
        """Store *text* and a punctuation-free copy used by the word metrics."""
        self.text = text
        self.clean_text = self._clean_text()

    def _clean_text(self):
        """Return the text with everything except word characters and whitespace removed."""
        import re
        # In str patterns \w is Unicode-aware and already covers CJK
        # characters, so no explicit CJK range is needed.
        return re.sub(r'[^\w\s]', '', self.text)

    def word_count(self):
        """Return the number of whitespace-separated words."""
        return len(self.clean_text.split())

    def character_count(self):
        """Return the number of characters, excluding ALL whitespace.

        Newlines and tabs are excluded as well as spaces; removing only
        " " would count line breaks as characters.
        """
        return len("".join(self.clean_text.split()))

    def sentence_count(self):
        """Return the number of non-empty sentences.

        Sentences end at ``.``, ``!``, ``?``, the CJK full stop and the
        full-width ``!``/``?``.
        """
        import re
        # Full-width marks are written as escapes so they survive any
        # punctuation normalization applied to this source text.
        fragments = re.split(r'[.!?。\uff01\uff1f]', self.text)
        return sum(1 for fragment in fragments if fragment.strip())

    def average_word_length(self):
        """Return the mean word length, or 0 when there are no words."""
        words = self.clean_text.split()
        if not words:
            return 0
        return sum(len(word) for word in words) / len(words)

    def most_common_words(self, n=5):
        """Return the *n* most frequent words as ``(word, count)`` pairs.

        Words are lower-cased first; words of one or two characters are
        ignored.
        """
        from collections import Counter
        words = (word for word in self.clean_text.lower().split() if len(word) > 2)
        return Counter(words).most_common(n)

    def readability_score(self):
        """Return a rough readability score (lower means easier to read).

        The score is the average sentence length in words plus ten times
        the average word length, rounded to two decimals; 0 when the text
        has no words or no sentences.
        """
        words = self.word_count()
        sentences = self.sentence_count()
        characters = self.character_count()

        if sentences == 0 or words == 0:
            return 0

        avg_sentence_length = words / sentences
        avg_word_length = characters / words
        return round(avg_sentence_length + avg_word_length * 10, 2)

    def generate_report(self):
        """Return a formatted multi-line report of every statistic."""
        header = (
            "\n"
            "文本统计报告:\n"
            "================\n"
            f"总字符数: {len(self.text)}\n"
            f"有效字符数: {self.character_count()}\n"
            f"词数: {self.word_count()}\n"
            f"句子数: {self.sentence_count()}\n"
            f"平均词长: {self.average_word_length():.2f}\n"
            f"可读性评分: {self.readability_score()}\n"
            "\n"
            "最常见的词:\n"
        )
        # Build the word lines in one join instead of repeated += on the report.
        common = "".join(
            f"  {word}: {count}次\n" for word, count in self.most_common_words()
        )
        return header + common

# Try the statistics tool on a short mixed Chinese/English paragraph
test_text = """
Python 是一种简洁而强大的编程语言。它具有简单易学的语法,
同时提供了丰富的标准库和第三方库。Python 广泛应用于数据科学、
Web开发、人工智能等领域。它的设计哲学强调代码的可读性和简洁性。
"""

stats = TextStatistics(test_text)
print(stats.generate_report())

总结

本文深入探讨了 Python 字符串的各种实战技巧:三种格式化方式(% 格式化、str.format() 与 f-string)、切片操作、常用字符串方法、编码与 Unicode 处理、常见陷阱及其解决方案,以及日志分析器和文本统计工具两个实战项目。

字符串处理是编程中的基础技能,掌握这些技巧将帮助你编写更高效、更可靠的文本处理代码。记住:格式化优先使用 f-string,大量拼接使用 join(),比较字符串前先用 strip() 和 casefold() 做规范化,编码和解码时要显式处理错误。

以上就是Python中字符串格式化、切片与常见陷阱详解的详细内容,更多关于Python字符串的资料请关注脚本之家其它相关文章!

您可能感兴趣的文章:
阅读全文